In [1]:
import re
import pandas as pd
import numpy as np
import warnings
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from wordsegment import segment
from html import unescape
import itertools
import random
warnings.filterwarnings('ignore')

In [2]:
# Open the workbook once; individual sheets are parsed from this handle.
# `encoding` is not an ExcelFile option (current pandas raises TypeError on it),
# so it is no longer passed.
xl_file = pd.ExcelFile("training-Obama-Romney-tweets.xlsx")

In [4]:
# Parse every sheet into its own DataFrame, keyed by the sheet's name.
# (Previously each key was mapped to a fresh parse of the 'Obama' sheet.)
dfs = {sheet_name: xl_file.parse(sheet_name)
          for sheet_name in xl_file.sheet_names}
# Index directly so a missing 'Obama' sheet fails here with a KeyError instead
# of propagating None into the cleaning functions.
obamaData=dfs['Obama']

In [5]:
def clearColumns(dataDF):
    """Reduce a raw tweet sheet to its ``tweet`` and ``label`` columns.

    The columns at positions 0, 1, 2 and 5 are discarded and the two that
    remain are renamed ``tweet`` and ``label`` (so the input must have exactly
    six columns). The first row is then dropped and only rows whose label is
    -1, 0 or 1 -- stored either as a number or as a string -- are returned.

    The intermediate frame and the distinct label values are printed as a
    quick sanity check. The input frame itself is not modified.

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Raw sheet as parsed from the workbook.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``tweet`` and ``label``; the original index labels
        of the surviving rows are preserved.
    """
    dataDF = dataDF.drop(dataDF.columns[[0, 1, 2, 5]], axis=1)
    dataDF.columns = ['tweet', 'label']
    # Skip the first row by position. This replaces the ``.ix[1:]`` indexer,
    # which no longer exists in pandas >= 1.0; on the default 0..n-1 index the
    # two select the same rows.
    # NOTE(review): the first row is presumably a secondary header -- confirm.
    dataDF = dataDF.iloc[1:]
    print(dataDF)
    print(dataDF['label'].unique())
    # Labels appear both as ints and as strings in the sheet, so accept both.
    return dataDF[dataDF['label'].isin([0, 1, -1, u'-1', u'1', u'0'])]

In [15]:
# Reduce the Obama sheet to its tweet/label columns and the rows labelled -1, 0 or 1.
obamaDataDF=clearColumns(obamaData)

                                                  tweet label
1     Kirkpatrick, who wore a baseball cap embroider...     0
2     Question: If <e>Romney</e> and <e>Obama</e> ha...     2
3     #<e>obama</e> debates that Cracker Ass Cracker...     1
4     RT @davewiner Slate: Blame <e>Obama</e> for fo...     2
5     @Hollivan @hereistheanswer  Youre missing the ...     0
6     <e>Mitt Romney</e> made all of his money himse...     2
7     I was raised as a Democrat  left the party yea...    -1
8     The <e>Obama camp</e> can't afford to lower ex...     0
9     Tonight's debate has that "Game 7" feel! This ...     2
10    <e>Obama</e> pot <a>policy</a> disappointing -...    -1
11    Not all of Hollywood has his back! RT @RedAler...    -1
12    @hblodget i'd be grateful for scoop from you g...     2
13    <e>Obama</e> must, to a degree, hit <e>MR</e> ...     2
14    <e>Obama</e>'s Expedient Speak fair in order t...     0
15    I had a dream that i was smoking with <e>Obama...     0
16    Th

In [16]:
# stopwords=['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his',
#            'himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what',
#            'which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has',
#            'had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of',
#            'at','by','for','with','about','against','between','into','through','during','before','after','above','below','to',
#            'from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when',
#            'where','why','how','all','any','both','each','few','more','most','other','some','such','only',
#            'own','same','so','than','too','very','can','will','just','should','now','im']

In [17]:
# Compiled once. A raw string is used so that ``\(`` and ``\)`` reach the regex
# engine verbatim instead of being parsed as (invalid) string escapes.
_URL_RE = re.compile(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+')

def cleanURLS(tweet):
    """Return ``tweet`` with every ``http://`` / ``https://`` URL removed.

    The scheme is matched case-sensitively in lowercase, so callers are
    expected to lowercase the text first. Text surrounding a URL is left
    untouched, including the whitespace on either side of it.
    """
    return _URL_RE.sub('', tweet)
    
def getText_fromHTML(tweet):
    """Return the text content of ``tweet`` with all markup tags removed.

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed (and warns about it), so the extracted
    text could differ between environments. ``html.parser`` ships with Python.
    """
    soup = BeautifulSoup(tweet, 'html.parser')
    return soup.get_text()

def removeAppostophes(tweet):
    """Expand apostrophe contractions and remove the apostrophes.

    Each whitespace-separated word is split on ``'``. Fragments that follow an
    apostrophe are looked up in the expansion table (``he's`` -> ``he is``,
    ``they're`` -> ``they are``); the fragment before the first apostrophe is
    always kept as written, so ordinary words such as ``re`` or ``t`` are no
    longer rewritten to ``are`` / ``it``. Empty fragments (from leading,
    trailing or doubled apostrophes) are dropped.

    Returns the resulting words joined by single spaces.
    """
    # NOTE(review): "t" -> "it" turns "don't" into "don it"; kept as-is because
    # changing the table would alter the produced features -- confirm intent.
    appostophes = {"s": "is", "re": "are", "em": "them", "ll": "will", "t": "it", "m": "am", "ve": "have", "d": "did"}
    words = []
    for word in tweet.split():
        head, *tails = word.split("'")
        pieces = [head] + [appostophes.get(tail, tail) for tail in tails]
        words.extend(piece for piece in pieces if piece)
    return " ".join(words)

def splitAttachedwords(tweet):
    """Break run-together words apart (e.g. ``goodboy`` -> ``good boy``).

    The text is handed to ``segment`` and the pieces it returns are joined
    back together with single spaces.
    """
    pieces = segment(tweet)
    separator = " "
    return separator.join(pieces)

def lemmatisation(tweet):
    """Return ``tweet`` with every whitespace-separated word lemmatised.

    Words are rejoined with single spaces. The lemmatiser is created once per
    call rather than once per word.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in tweet.split())

def stemming(tweet):
    """Return ``tweet`` with every whitespace-separated word Porter-stemmed.

    Words are rejoined with single spaces. The stemmer is created once per
    call rather than once per word.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in tweet.split())

In [18]:
def _stripMentionsAndHashMarks(tweet):
    """Drop tokens that start with '@' and remove one leading '#' from hashtags."""
    kept = []
    for token in tweet.split():
        if token.startswith('@'):
            continue
        if token.startswith('#'):
            token = token[1:]
        if token:
            kept.append(token)
    return " ".join(kept)


def _keepAlphabeticWords(tweet):
    """Strip punctuation from each word and keep only purely alphabetic words."""
    kept = []
    for word in tweet.split():
        word = re.sub(r'[^\w\s]', '', word)
        if word.isalpha():
            kept.append(word)
    return " ".join(kept)


def _cleanTweetText(tweet):
    """Run one raw tweet string through the cleaning pipeline ('' if nothing is left)."""
    tweet = tweet.lower()
    # remove the urls from tweet
    tweet = cleanURLS(tweet)
    # remove HTML tags
    tweet = getText_fromHTML(tweet)
    # Usernames and hashtag markers are handled BEFORE word segmentation.
    # NOTE(review): wordsegment's segment() appears to discard every
    # non-alphanumeric character (confirm against its docs), in which case
    # checking for '@' / '#' afterwards can never match and mentions would
    # leak into the text as ordinary words.
    tweet = _stripMentionsAndHashMarks(tweet)
    if not tweet:
        return ""
    # remove appostophes
    tweet = removeAppostophes(tweet)
    # split attached words ex: goodboy = good boy
    tweet = splitAttachedwords(tweet)
    # strip punctuation and drop anything that is not a plain word
    tweet = _keepAlphabeticWords(tweet)
    if not tweet:
        return ""
    return stemming(lemmatisation(tweet)).strip()


def cleanTweets(dataDF):
    """Normalise the ``tweet`` and ``label`` columns of ``dataDF``.

    For every row the label is converted to ``int`` when it is stored as a
    string, and the tweet is lowercased, stripped of URLs, markup, @usernames
    and hashtag markers, has its contractions expanded and run-together words
    split, is reduced to alphabetic words, and is finally lemmatised and
    stemmed.

    Rows are removed when the tweet is not a string, when nothing is left of
    it after cleaning, or when the row contains missing values. (The rows to
    remove used to be collected but never actually dropped.)

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Frame with at least the columns ``tweet`` and ``label``. It is not
        modified; results are written to a copy instead of to the rows yielded
        by ``iterrows()``, which pandas does not guarantee to write through.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy containing only the surviving rows, with their original
        index labels.

    Raises
    ------
    ValueError
        If a string label cannot be converted with ``int()``.
    """
    cleanedTweets = []
    cleanedLabels = []
    keepRow = []
    for tweet, label in zip(dataDF['tweet'], dataDF['label']):
        if isinstance(label, str):
            label = int(label)
        cleanedLabels.append(label)
        text = _cleanTweetText(tweet) if isinstance(tweet, str) else ""
        cleanedTweets.append(text)
        keepRow.append(bool(text))

    cleanedDF = dataDF.copy()
    cleanedDF['tweet'] = cleanedTweets
    cleanedDF['label'] = cleanedLabels
    # Positional boolean mask, so the filter does not depend on index labels.
    return cleanedDF[np.array(keepRow, dtype=bool)].dropna()

In [19]:
# cleanTweets is given a deep copy, so obamaDataDF itself is not touched by the cleaning step.
obamaCleanedData=cleanTweets(obamaDataDF.copy(deep=True))

In [20]:
# Shuffle the cleaned data once, reproducibly, and write it to disk.
#
# train_test_split is not needed for the shuffle itself. The import is kept,
# from its current module with the pre-0.18 location as a fallback, only in
# case later cells rely on the name being defined here.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Fixed seed so that Restart & Run All reproduces the same row order.
SHUFFLE_SEED = 42

# A single seeded permutation replaces 500 rounds of unseeded 50/50 splitting
# and re-concatenation (which also relied on the deprecated
# numpy.random.random_integers); both produce a random row order.
final_data = obamaCleanedData[['tweet', 'label']].sample(frac=1, random_state=SHUFFLE_SEED)
final_data.to_csv('obamaCleanedData.csv',sep=',',encoding='utf-8',header=['tweet','label'],index=False)

In [21]:
# Release the intermediate frames; final_data is not deleted and the shuffled rows are also saved in obamaCleanedData.csv.
del obamaData,obamaDataDF,obamaCleanedData