In [1]:
import re
import pandas as pd
import numpy as np
import warnings
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from wordsegment import segment
from html import unescape
import itertools
import random
warnings.filterwarnings('ignore')

In [2]:
# Open the workbook once; individual sheets are parsed from it in the next cell.
# `encoding` is not an ExcelFile argument (.xlsx is a zipped XML container, not
# a text file), so it is no longer passed.
xl_file = pd.ExcelFile("training-Obama-Romney-tweets-sampled.xlsx")

In [3]:
# Map every sheet name to its own parsed DataFrame.  Each sheet is parsed by
# its own name (previously every key received a copy of the 'Romney' sheet).
dfs = {sheet_name: xl_file.parse(sheet_name)
          for sheet_name in xl_file.sheet_names}
# None if the workbook has no sheet called 'Romney'.
romneyData=dfs.get('Romney')

In [4]:
def clearColumns(dataDF):
    """Reduce a raw sheet to labelled (tweet, label) rows.

    Steps:
      1. drop the columns at positions 0, 1, 2 and 5;
      2. rename the two remaining columns to 'tweet' and 'label'
         (so the input is expected to have exactly six columns);
      3. skip the first row;
      4. print the intermediate frame and the distinct label values;
      5. keep only rows whose label is 0, 1 or -1 (as int or as str).

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Raw sheet; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with columns ['tweet', 'label']; the original index
        labels of the surviving rows are preserved.
    """
    dataDF = dataDF.drop(dataDF.columns[[0, 1, 2, 5]], axis=1)
    dataDF.columns = ['tweet', 'label']
    # `.ix` has been removed from pandas; positional slicing skips the first
    # row regardless of what the index labels are.
    dataDF = dataDF.iloc[1:]
    print(dataDF)
    print(dataDF['label'].unique())
    dataDF = dataDF[dataDF['label'].isin([0, 1, -1, u'-1', u'1', u'0'])]
    return dataDF

In [5]:
# Reduce the raw Romney sheet to its 'tweet' and 'label' columns; clearColumns
# also prints the intermediate frame and the distinct label values (see below).
romneyDataDF = clearColumns(romneyData)
#romneyData

                                                  tweet label
1     Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...    -1
2     Senior <e>Romney</e> Advisor Claims <e>Obama</...     2
3     .@WardBrenda @shortwave8669 @allanbourdius you...    -1
4     <e>Mitt Romney</e> still doesn't <a>believe</a...    -1
5     <e>Romney</e>'s <a>tax plan</a> deserves a 2nd...    -1
6     Hope <e>Romney</e> debate prepped w/ the same ...     1
7     Want to know how <e>Mitt Romney</e> is going t...    -1
8     If <e>Romney</e> wins the <a>presidential elec...    -1
9     Presidential debate round 2: <e>Romney</e> wan...     2
10    Someone on the <e>mitt Romney</e> <a>Facebook ...  !!!!
11    <e>Romney</e>'s <a>12 million jobs scam </a>re...    -1
12    @LoreleiMission "girl look at that body x 3 Th...  !!!!
13    <e>Mitt #Romney</e> <a>said</a> that <a> catch...    -1
14    Please be<e> Mitt Romney</e> “@HuffingtonPost:...     0
15    <e>Romney</e> leads <e>Obama</e> on economy, j...     2
16    #<

In [84]:
# stopwords=['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his',
#            'himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what',
#            'which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has',
#            'had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of',
#            'at','by','for','with','about','against','between','into','through','during','before','after','above','below','to',
#            'from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when',
#            'where','why','how','all','any','both','each','few','more','most','other','some','such','only',
#            'own','same','so','than','too','very','can','will','just','should','now','im']

In [6]:
def cleanURLS(tweet):
    """Return `tweet` with every http:// or https:// URL removed.

    The scheme is matched case-sensitively (lower-case 'http' only), and a URL
    ends at the first character that is not allowed by the pattern, e.g. a
    space.  Note that the class ``[$-_@.&+]`` contains the ASCII range from
    '$' to '_', which already covers digits, upper-case letters and most
    punctuation.
    """
    # Raw string: the pattern contains `\(` and `\)`, which are invalid escape
    # sequences in an ordinary string literal.
    return re.sub(
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
        '',
        tweet,
    )
    
def getText_fromHTML(tweet):
    """Return the text content of `tweet` with markup tags stripped.

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed, which makes the result machine-dependent
    and emits a warning on every call.
    """
    soup = BeautifulSoup(tweet, 'html.parser')
    return soup.get_text()

def removeAppostophes(tweet):
    """Expand apostrophe contractions and drop the apostrophes.

    Every whitespace-separated word is split on "'".  The part before the
    first apostrophe is kept verbatim; each part after an apostrophe is
    replaced by its expansion when it is a known suffix ("s" -> "is",
    "re" -> "are", ...) and kept verbatim otherwise.  Empty parts (leading or
    trailing apostrophes) are discarded.

    Words that contain no apostrophe are never rewritten, so a stand-alone
    "d", "re" or "em" is left alone.

    Returns the resulting words joined by single spaces.
    """
    # NOTE(review): "t" maps to "it", so "don't" becomes "don it" -- confirm
    # whether "not" was intended for n't contractions.
    appostophes = {"s": "is", "re": "are", "em": "them", "ll": "will", "t": "it", "m": "am", "ve": "have", "d": "did"}
    words = []
    for word in tweet.split():
        head, *suffixes = word.split("'")
        if head:
            words.append(head)
        # Only fragments that actually followed an apostrophe are expanded.
        for suffix in suffixes:
            if suffix:
                words.append(appostophes.get(suffix, suffix))
    return " ".join(words)

def splitAttachedwords(tweet):
    """Run `tweet` through wordsegment's `segment` and re-join the pieces.

    The segments returned by `segment` are joined with single spaces
    (intended for run-together words, e.g. "goodboy" -> "good boy").
    """
    pieces = segment(tweet)
    return " ".join(pieces)

def lemmatisation(tweet):
    """Lemmatise every whitespace-separated word of `tweet`.

    Returns the lemmatised words joined by single spaces.
    """
    # One lemmatizer for the whole tweet instead of a new instance per word.
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tweet.split())

def stemming(tweet):
    """Apply Porter stemming to every whitespace-separated word of `tweet`.

    Returns the stemmed words joined by single spaces.
    """
    # One stemmer for the whole tweet instead of a new instance per word.
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in tweet.split())

In [7]:
def cleanTweets(dataDF):
    """Normalise the 'tweet' column and coerce string labels to int.

    For every row whose tweet is a string the text is, in order:
    lower-cased, stripped of URLs, stripped of markup tags, stripped of
    @mentions, contraction-expanded, word-segmented, reduced to purely
    alphabetic words, lemmatised and stemmed.

    Rows are dropped when the tweet is not a string or when nothing is left
    after cleaning.  Labels stored as strings are converted with `int`
    (raises ValueError for a non-numeric string).  Remaining rows containing
    NaN are dropped as well.

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Frame with 'tweet' and 'label' columns.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows, original index labels kept.
    """
    mention_re = re.compile(r'@\w+')
    non_word_re = re.compile(r'[^\w\s]')

    # Results are collected and assigned in one go: writing into the rows
    # yielded by `iterrows()` is not guaranteed to reach the frame.
    kept_positions = []
    cleaned_tweets = []
    for position, tweet in enumerate(dataDF['tweet'].tolist()):
        if not isinstance(tweet, str):
            continue
        tweet = tweet.lower()

        # remove the urls from tweet
        tweet = cleanURLS(tweet)

        # remove HTML tags
        tweet = getText_fromHTML(tweet)

        # Remove usernames *before* segmentation: wordsegment appears to
        # discard non-alphanumeric characters, so an '@' prefix test made
        # afterwards would never match.
        tweet = mention_re.sub(' ', tweet)

        # remove appostophes
        tweet = removeAppostophes(tweet)

        # split attached words ex: goodboy = good boy
        tweet = splitAttachedwords(tweet)

        words = []
        for word in tweet.split():
            # strip punctuation (this also removes a leading '#' of hashtags)
            word = non_word_re.sub('', word)
            # keep purely alphabetic tokens only
            if word.isalpha():
                words.append(word)
        if not words:
            continue

        kept_positions.append(position)
        cleaned_tweets.append(stemming(lemmatisation(" ".join(words))).strip())

    result = dataDF.iloc[kept_positions].copy()
    result['tweet'] = cleaned_tweets
    result['label'] = [int(label) if isinstance(label, str) else label
                       for label in result['label'].tolist()]
    return result.dropna()

In [8]:
# Clean a deep copy so that romneyDataDF itself is left as clearColumns returned it.
romneyCleanedData=cleanTweets(romneyDataDF.copy(deep=True))
# romneyCleanedData.dropna(axis=0,inplace=True)

In [9]:
# Spot-check the last rows of the cleaned data.
romneyCleanedData.tail()

Unnamed: 0,tweet,label
7790,romney is gonna win,1
7791,i just gain so much more respect of romney,1
7792,is good news add that to the,1
7793,pray for anoth romney big win in debat today,1
7794,is good news add that to the,1


In [10]:
# Shuffling the cleaned data
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn releases that predate the `model_selection` module
    from sklearn.cross_validation import train_test_split
# Deprecated NumPy helper; no longer used by this cell, import kept in case
# later cells still reference `ri`.
from numpy.random import random_integers as ri

# Fixed seed so that Restart & Run All writes the same CSV every time.
SHUFFLE_SEED = 42

# train_test_split shuffles by default, so a single split followed by
# re-concatenating both halves yields a shuffled copy of the data; repeating
# the split 500 times with random seeds added nothing but run time.
X_train,X_test,y_train,y_test = train_test_split(romneyCleanedData['tweet'],romneyCleanedData['label'],test_size=0.5,random_state=SHUFFLE_SEED)
train_dataDF = pd.concat([X_train,y_train],axis=1)
test_dataDF = pd.concat([X_test,y_test],axis=1)
final_data = pd.concat([train_dataDF,test_dataDF],axis=0)
final_data.to_csv('romneyCleanedData_with_sampling_2.csv',sep=',',encoding='utf-8',header=['tweet','label'],index=False)
