In [4]:
#Importing libraries

import re
import numpy as np
import pandas as pd
import nltk
import ast     
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import WordPunctTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the streamed tweets from a tab-separated file, supplying the column
# names explicitly. quoting=3 is csv.QUOTE_NONE: quote characters inside a
# tweet are kept as literal text.
raw_data = pd.read_csv(
    'twitter_data.tsv',
    sep='\t',
    quoting=3,
    names=['tweet', 'device', 'location', 'candidate'],
)


In [6]:
# Load the candidates' own tweets (tab-separated, quote characters kept
# literally because quoting=3 is csv.QUOTE_NONE).
cand_twt = pd.read_csv(
    'Data/Tweets_by_candidates.tsv',
    sep='\t',
    quoting=3,
    names=['tweet', 'candidate'],
)

## Data Processing

In [None]:
## Streamed tweets
# Remove exact duplicate rows and renumber the index from 0.
raw_data = raw_data.drop_duplicates().reset_index(drop=True)

# Add NaN placeholder columns: the four sentiment scores first, then the
# hashtag ('ht') and mention columns that later cells fill in.
for placeholder in ('Sentiment_Compund', 'Sentiment_Neg', 'Sentiment_Neu',
                    'Sentiment_Pos', 'ht', 'mention'):
    raw_data[placeholder] = np.nan

In [None]:
## Candidates' tweets
# Remove exact duplicate rows and renumber the index from 0.
cand_twt = cand_twt.drop_duplicates().reset_index(drop=True)

# Add NaN placeholder columns: the four sentiment scores first, then the
# hashtag ('ht') and mention columns that later cells fill in.
for placeholder in ('Sentiment_Compund', 'Sentiment_Neg', 'Sentiment_Neu',
                    'Sentiment_Pos', 'ht', 'mention'):
    cand_twt[placeholder] = np.nan

In [None]:
## Extracting hashtags and mentions from streamed tweets

# Each cell receives the list of matches, or NaN when the tweet has none.
# The columns are built in one pass with Series.map instead of writing
# element by element through chained indexing (raw_data['ht'][i] = ...),
# which pandas does not guarantee to write back to the frame.
raw_data['ht'] = raw_data['tweet'].map(
    lambda tweet: re.findall(r'#(\w+)', tweet) or np.nan)

# Mentions are stored whenever the tweet contains a mention. Previously this
# was gated on the hashtag list, so mentions in hashtag-free tweets were lost.
raw_data['mention'] = raw_data['tweet'].map(
    lambda tweet: re.findall(r'@(\w+)', tweet) or np.nan)

# Replace every remaining NaN in the frame (not just in 'ht'/'mention') with
# the string "None"; later cells compare against that sentinel.
raw_data = raw_data.fillna("None")

In [None]:
## Extracting hashtags and mentions from candidates' tweets

# Each cell receives the list of matches, or NaN when the tweet has none.
# The columns are built in one pass with Series.map instead of writing
# element by element through chained indexing (cand_twt['ht'][i] = ...),
# which pandas does not guarantee to write back to the frame.
cand_twt['ht'] = cand_twt['tweet'].map(
    lambda tweet: re.findall(r'#(\w+)', tweet) or np.nan)

# Mentions are stored whenever the tweet contains a mention. Previously this
# was gated on the hashtag list, so mentions in hashtag-free tweets were lost.
cand_twt['mention'] = cand_twt['tweet'].map(
    lambda tweet: re.findall(r'@(\w+)', tweet) or np.nan)

# Replace every remaining NaN in the frame (not just in 'ht'/'mention') with
# the string "None"; later cells compare against that sentinel.
cand_twt = cand_twt.fillna("None")

In [None]:
## Extracting state name from location

# Reading the states file; the first row is dropped after loading
# (presumably a header line -- TODO confirm against states.tsv).
states = pd.read_csv('states.tsv', delimiter='\t',
                     quoting=3,
                     names=['State', 'Abbrv.', 'PS'])
states = states.drop([0]).reset_index(drop=True)

# Map each 'PS' value to its 'State' value.
states_dict = dict(zip(states['PS'], states['State']))

# Build the whole 'state' column in a list and assign it once; element-wise
# chained assignment (raw_data['state'][i] = ...) is not guaranteed to write
# back to the frame. The location is read by column name rather than by the
# positional index 2.
state_matches = []
total = len(raw_data)
for i, location in enumerate(raw_data['location']):
    if (i + 1) % 10000 == 0:
        print("%d of %d Tweets has been Processed" % (i + 1, total))
    matched = np.nan
    # Case-sensitive substring test on either the 'PS' code or the state
    # name. No early exit: as before, the last matching entry wins.
    for code, name in states_dict.items():
        if code in location or name in location:
            matched = code
    state_matches.append(matched)

raw_data['state'] = state_matches

# Checking observations with no state information
raw_data['state'].isnull().sum()

## Sentiment analysis using VADER

In [None]:
## Analyzing sentiment using VADER
analyser = SentimentIntensityAnalyzer()

# For streamed tweets: score every tweet first, then assign whole columns.
# Writing scores one element at a time through chained indexing
# (raw_data['Sentiment_Neg'][i] = ...) is not guaranteed to reach the frame.
stream_scores = []
total = len(raw_data)
for i, tweet in enumerate(raw_data['tweet']):
    if (i + 1) % 10000 == 0:
        print("%d of %d Tweets has been analyzed" % (i + 1, total))
    stream_scores.append(analyser.polarity_scores(tweet))

# One row per tweet, one column per score key. Naming the columns keeps this
# valid even when there are no tweets.
stream_scores = pd.DataFrame(stream_scores, index=raw_data.index,
                             columns=['compound', 'neg', 'neu', 'pos'])
raw_data['Sentiment_Compund'] = stream_scores['compound']
raw_data['Sentiment_Neg'] = stream_scores['neg']
raw_data['Sentiment_Neu'] = stream_scores['neu']
raw_data['Sentiment_Pos'] = stream_scores['pos']

In [None]:
# For tweets by candidates: score every tweet with the `analyser` created in
# the streamed-tweets cell, then assign whole columns. Writing scores one
# element at a time through chained indexing is not guaranteed to reach the
# frame.
cand_scores = []
total = len(cand_twt)
for i, tweet in enumerate(cand_twt['tweet']):
    if (i + 1) % 1000 == 0:
        print("%d of %d Tweets has been analyzed" % (i + 1, total))
    cand_scores.append(analyser.polarity_scores(tweet))

# One row per tweet, one column per score key. Naming the columns keeps this
# valid even when there are no tweets.
cand_scores = pd.DataFrame(cand_scores, index=cand_twt.index,
                           columns=['compound', 'neg', 'neu', 'pos'])
cand_twt['Sentiment_Compund'] = cand_scores['compound']
cand_twt['Sentiment_Neg'] = cand_scores['neg']
cand_twt['Sentiment_Neu'] = cand_scores['neu']
cand_twt['Sentiment_Pos'] = cand_scores['pos']

### Writing data to file for backup

In [None]:
 # Back up both scored frames as tab-separated text, one row per tweet.
# Each file is opened once inside a `with` block (previously it was reopened
# and closed for every row) and the code starts at column 0 (the stray
# leading space made the cell an IndentationError).
# NOTE(review): the files are opened in append mode, as before, so re-running
# this cell adds the same rows again -- delete the files first or confirm
# that appending is intended.
stream_cols = ['device', 'candidate', 'Sentiment_Compund', 'Sentiment_Neg',
               'Sentiment_Neu', 'Sentiment_Pos', 'ht', 'mention', 'state']
with open("Tweets.tsv", "a") as f:
    for row in zip(*(raw_data[col] for col in stream_cols)):
        f.write("\t".join(str(value) for value in row) + "\n")

cand_cols = ['candidate', 'Sentiment_Compund', 'Sentiment_Neg',
             'Sentiment_Neu', 'Sentiment_Pos', 'ht', 'mention']
with open("Cand_tweets.tsv", "a") as f:
    for row in zip(*(cand_twt[col] for col in cand_cols)):
        f.write("\t".join(str(value) for value in row) + "\n")

### Reading saved data

In [None]:
# Read the streamed-tweet backup. The writer emits nine tab-separated fields
# per row (ending with the state), so all nine names are listed here; with
# only eight names pandas would take the first field as the index and shift
# every column label by one.
tweets_in = pd.read_csv('Tweets.tsv', delimiter='\t',
                        quoting=3,
                        names=['device', 'candidate', 'Sentiment_Compund',
                               'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos',
                               'ht', 'mention', 'state'])

In [None]:
# Read the candidates' backup: seven tab-separated fields per row.
# (The call was missing its closing parenthesis.)
cand_tweets_in = pd.read_csv('Cand_tweets.tsv', delimiter='\t',
                             quoting=3,
                             names=['candidate', 'Sentiment_Compund',
                                    'Sentiment_Neg', 'Sentiment_Neu', 'Sentiment_Pos',
                                    'ht', 'mention'])

In [None]:
## Counting how often each hashtag occurs in the tweets (case-insensitive)
dict_one = {}

for raw_tags in tweets_in['ht']:
    # "None" is the sentinel stored for tweets without hashtags.
    if raw_tags == 'None':
        continue
    # The column holds the text form of a Python list; parse it back.
    for tag in ast.literal_eval(raw_tags):
        key = tag.lower()
        dict_one[key] = dict_one.get(key, 0) + 1

In [None]:
## Counting how often each mention occurs in the tweets (case-insensitive)
dict_two = {}

for raw_mentions in tweets_in['mention']:
    # "None" is the sentinel stored for tweets without mentions.
    if raw_mentions == 'None':
        continue
    # The column holds the text form of a Python list; an empty list simply
    # contributes nothing.
    for handle in ast.literal_eval(raw_mentions):
        key = handle.lower()
        dict_two[key] = dict_two.get(key, 0) + 1

# Sentiment Analysis (Using Bag of Words Model)

In [None]:
# Load the Sentiment140 training CSV: no header row, ISO-8859-1 encoded.
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
sentiment140_df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    encoding='iso-8859-1',
)

sentiment140_df.head()

sentiment140_df['sentiment'].value_counts()
# Keep only the label and the tweet text.
sentiment140_df.drop(columns=['id', 'date', 'query_string', 'user'], inplace=True)

# Only this last expression is displayed by the cell.
sentiment140_df.loc[sentiment140_df['sentiment'] == 0].head(10)

In [None]:
from pprint import pprint

# The cell used to read an undefined `train` frame and a 'pre_clean_len'
# column that no cell creates. It now describes the frame loaded above and
# derives the raw tweet length first (assigning the column is idempotent).
sentiment140_df['pre_clean_len'] = sentiment140_df['text'].str.len()

# Data dictionary: dtype and meaning of each column, plus the frame's shape.
data_dict = {
    'sentiment':{
        'type':sentiment140_df['sentiment'].dtype,
        'description':'sentiment class - 0:negative, 1:positive'
    },
    'text':{
        'type':sentiment140_df['text'].dtype,
        'description':'tweet text'
    },
    'pre_clean_len':{
        'type':sentiment140_df['pre_clean_len'].dtype,
        'description':'Length of the tweet before cleaning'
    },
    'dataset_shape':sentiment140_df.shape
}
pprint(data_dict)

## Data Cleaning

In [None]:
from nltk.tokenize import WordPunctTokenizer

### Cleaning and parsing tweet

In [None]:
# Tokenizer and patterns shared by tweet_cleaner.
tok = WordPunctTokenizer()
# Underscore is included so "@user_name" is removed as a whole; without it
# the "_name" tail survived and ended up in the cleaned text as a word.
mention_reg = r'@[A-Za-z0-9_]+'
url_reg = r'https?://[A-Za-z0-9./]+'
# Single alternation matching either a mention or a URL.
combined_reg = r'|'.join((mention_reg, url_reg))

In [None]:
def tweet_cleaner(text):
    """Reduce a raw tweet to lower-case, space-separated alphabetic tokens.

    Steps, in order: parse ``text`` with BeautifulSoup (lxml parser) and take
    its text content; delete every match of ``combined_reg``; replace each
    character that is not an ASCII letter with a space; lower-case; tokenize
    with ``tok`` and join the tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned tweet, possibly empty.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    stripped = re.sub(combined_reg, '', souped)
    # The former `try: stripped.decode("utf-8-sig") ... except:` step is gone:
    # on the text returned by get_text() the decode never succeeded, so the
    # bare except always fell back to `stripped` unchanged. Any BOM or
    # replacement character is turned into a space by the next substitution.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    words = tok.tokenize(letters_only.lower())
    return " ".join(words).strip()

In [None]:
# Clean every Sentiment140 tweet. The loop used to read an undefined `train`
# frame; it now reads the frame loaded from the CSV above.
clean_tweet_texts = []
total = len(sentiment140_df)
for i, text in enumerate(sentiment140_df['text']):
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, total))
    clean_tweet_texts.append(tweet_cleaner(text))

s140_df = pd.DataFrame(clean_tweet_texts, columns=['text'])
# Attach labels by position (.values) so they line up with the cleaned rows
# regardless of the source frame's index.
s140_df['target'] = sentiment140_df['sentiment'].values
s140_df.to_csv('clean_sentiment140.csv', encoding='utf-8')
# Last expression of the cell, so the preview is actually displayed.
s140_df.head()

### Stemming

In [None]:
# Stem every cleaned tweet and drop English stopwords.
# The loop bound and the labels used to come from undefined names (`nums`,
# `my_df`); they now come from s140_df, the frame being stemmed.
corpus = []
total = len(s140_df)
# Built once: previously the stopword set was rebuilt for every word and the
# stemmer for every tweet.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
for i, cleaned in enumerate(s140_df['text']):
    if (i + 1) % 10000 == 0:
        print("Tweets %d of %d has been processed" % (i + 1, total))
    tokens = str(cleaned).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in tokens
                           if word not in english_stopwords))

train140 = pd.DataFrame(corpus, columns=['text'])
# Attach labels by position so they line up with the stemmed rows.
train140['sentiment'] = s140_df['target'].values
train140.to_csv('final_train.csv', encoding='utf-8')
# Last expression of the cell, so the preview is actually displayed.
train140.head()

### Model Training (Using Count Vectorization)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Drop rows with missing values in place before vectorizing.
train140.dropna(inplace=True)

# Bag-of-words features limited to 1800 vocabulary terms, converted to a
# dense array.
cv = CountVectorizer(max_features=1800)
word_counts = cv.fit_transform(train140.text)
X = word_counts.toarray()
# Labels are the second column of the frame.
y = train140.iloc[:, 1].values

In [None]:
# Splitting the dataset into the Training set and Test set:
# 20% of the rows are held out for testing; random_state=0 fixes the shuffle
# so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Fitting Gaussian Naive Bayes to the Training set (count-vector features).
# The GaussianNB name imported here is reused by later cells.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results with the count-vector classifier
y_pred = classifier.predict(X_test)

In [None]:
# Making the Confusion Matrix from the true labels and the predictions
# (true labels are passed first, predictions second).
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Show the confusion matrix of the count-vector model
print(cm)

### Model Training (Using Hashing Vectorizer)

In [None]:
# Raw stemmed tweet strings and their labels. These used to be read from an
# undefined `train_data`; the stemmed frame built above is train140.
# X stays as text here: the vectorizers below are fitted after the split.
X = train140['text'].values
y = train140['sentiment'].values

# Splitting the dataset into the Training set and Test set (20% held out,
# fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# HashingVectorizer was used without ever being imported; import it here so
# the cell runs on a fresh kernel.
from sklearn.feature_extraction.text import HashingVectorizer

# create the transform: each document is hashed into 200 feature columns
vectorizer = HashingVectorizer(n_features=200)
# encode the training documents
vector = vectorizer.fit_transform(X_train)

# summarize encoded vector
print(vector.shape)

In [None]:
# Gaussian Naive Bayes trained on the hashed features; .toarray() converts
# the vectorizer output to a dense array before fitting.
classifier_hash = GaussianNB()
classifier_hash.fit(vector.toarray(), y_train)

In [None]:
# Encode the test documents with the same hashing vectorizer and show the
# resulting shape.
vector_test = vectorizer.transform(X_test)
print(vector_test.shape)

In [None]:
# Predict test labels with the hashing-vectorizer classifier
y_pred = classifier_hash.predict(vector_test.toarray())

In [None]:
# Confusion matrix for the predictions made in the previous cell
cm = confusion_matrix(y_test, y_pred)
print(cm)

### Model Training (Using TF-IDF Vectorizer)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features: at most 500 terms, ignoring terms that appear in fewer
# than 7 documents or in more than 80% of them, with NLTK's English
# stopwords removed.
vectorizer_tfidf = TfidfVectorizer(
    max_features=500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)

# Learn the vocabulary on the training documents and encode them.
vector = vectorizer_tfidf.fit_transform(X_train)

In [None]:
# Inspect the encoded training matrix. Only the last expression of a cell is
# displayed, so this shows the shape; the type() result is not shown.
type(vector)
vector.shape

In [None]:
# Gaussian Naive Bayes trained on the TF-IDF features; .toarray() converts
# the vectorizer output to a dense array before fitting.
classifier_tfidf = GaussianNB()
classifier_tfidf.fit(vector.toarray(), y_train)

In [None]:
# Encode the test documents with the fitted TF-IDF vectorizer and show the
# resulting shape.
vector_test = vectorizer_tfidf.transform(X_test)
print(vector_test.shape)

In [None]:
# Predict with the classifier trained on the TF-IDF features. This cell used
# classifier_hash, which was fitted on the 200-column hashing features and
# does not match the TF-IDF test matrix.
y_pred = classifier_tfidf.predict(vector_test.toarray())

In [None]:
# Confusion matrix for the predictions made in the previous cell
cm = confusion_matrix(y_test, y_pred)
print(cm)

### Creating word cloud

In [None]:
from wordcloud import WordCloud
from PIL import Image
# Word-cloud generator shared by the cells below: white background, at most
# 3000 words drawn.
wc = WordCloud(background_color="white", max_words=3000)

In [None]:
# Reload the candidates' tweets for the word clouds.
# NOTE(review): an earlier cell reads 'Data/Tweets_by_candidates.tsv'; this
# one reads the same file name without the 'Data/' folder -- confirm which
# path is correct.
test_data = pd.read_csv(
    'Tweets_by_candidates.tsv',
    sep='\t',
    quoting=3,
    names=['tweet', 'candidate'],
)

# One sub-frame per candidate, selected by exact match on 'candidate'.
candidate_col = test_data['candidate']
joe_words = test_data[candidate_col == 'Joe_Biden']
warren_words = test_data[candidate_col == 'Elizabeth_Warren']
sanders_words = test_data[candidate_col == 'Bernie_Sanders']
booker_words = test_data[candidate_col == 'Cory_Booker']
yang_words = test_data[candidate_col == 'Andrew_Yang']
pete_words = test_data[candidate_col == 'Pete_Buttigieg']

In [None]:
# Collect every whitespace-separated word of the selected candidate's tweets
# into one string.
corpus = []
for line in joe_words['tweet']:
    corpus.extend(line.split())

text = ' '.join(corpus)

# Remove mentions and URLs.
text = re.sub(r'@[A-Za-z0-9]+', '', text)
text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
# Remove the filler words only where they stand alone. The previous plain
# substring replacements also cut them out of longer words, e.g.
# "party" -> "pay", "campaign" -> "caign", "news" -> "s".
text = re.sub(r'\b(?:rt|RT|amp|will|new|today)\b', '', text)

wc.generate(text)
wc.to_file('wc_joe.jpeg')