# Imports 

In [1]:
import numpy as mp
import pandas as pd 
from textblob import TextBlob
import string 
import re 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [2]:
# Load the tweet export into a DataFrame; the path is relative to the
# notebook's working directory.
trump = pd.read_csv('alltweets.csv')

In [3]:
# Preview the first five rows of the raw data to see which columns are available.
trump.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,I am in Japan at the G-20 representing our Cou...,06-28-2019 09:26:34,20416,102303,False,1144537564944228352
1,Twitter for iPhone,The Stock Market went up massively from the da...,06-28-2019 09:12:18,15350,78061,False,1144533973428842496
2,Twitter for iPhone,All Democrats just raised their hands for givi...,06-28-2019 01:37:04,59615,233426,False,1144419410729242625
3,Twitter for iPhone,Great to be back in Japan for the #G20OsakaSum...,06-28-2019 00:46:37,12999,63019,False,1144406713165963264
4,Twitter for iPhone,Bipartisan Humanitarian Aid Bill for the South...,06-27-2019 22:22:39,22269,94193,False,1144370485783867392


# DataFrame cleaning

In [4]:
# Discard the identifier, timestamp and client columns; none of them is
# used in the steps that follow.
unused_columns = ['id_str', 'created_at', 'source']
trump = trump.drop(columns=unused_columns)

In [5]:
# Removing retweets so that only original tweets remain.
# The comparison with True is kept (rather than negating the column directly)
# so that any non-boolean / missing flag is treated as "not a retweet".
mask = trump['is_retweet'] == True
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not write into a slice and do not
# raise pandas' SettingWithCopyWarning.
trump = trump[~mask].copy()

In [6]:
# Removing links from tweets: keep only the text that precedes the first URL.
# - The pattern is a raw string, so it no longer relies on the invalid "\/"
#   string escape the previous pattern contained.
# - "https?" also matches plain http:// links, which were left in the text before.
# - maxsplit=1 stops at the first link; everything from there on is discarded.
# str(x) guards against non-string cells (e.g. missing values).
trump['cleanLinks'] = trump['text'].apply(
    lambda x: re.split(r'https?://', str(x), maxsplit=1)[0]
)

In [7]:
# The retweet flag and the raw text are no longer needed: retweets were
# filtered out above and the link-free text lives in 'cleanLinks'.
processed_columns = ['is_retweet', 'text']
trump = trump.drop(columns=processed_columns)

# Performing sentiment analysis using TextBlob

In [8]:
# Score every cleaned tweet with TextBlob's polarity and store it as a new column.
trump['sentiment'] = trump['cleanLinks'].map(
    lambda tweet: TextBlob(tweet).sentiment.polarity
)

In [9]:
# Check the first five rows now that the 'cleanLinks' and 'sentiment' columns exist.
trump.head()

Unnamed: 0,retweet_count,favorite_count,cleanLinks,sentiment
0,20416,102303,I am in Japan at the G-20 representing our Cou...,-0.295
1,15350,78061,The Stock Market went up massively from the da...,0.0
2,59615,233426,All Democrats just raised their hands for givi...,-0.036458
3,12999,63019,Great to be back in Japan for the #G20OsakaSummit,0.4
4,22269,94193,Bipartisan Humanitarian Aid Bill for the South...,0.125


In [10]:
# Resetting the index so rows are numbered 0..n-1 again after the retweet
# filter; the previous index is kept as a regular 'index' column.
trump = trump.reset_index()

# Feature Engineering

In [11]:
# Define a rank function that labels each sentiment score as negative, neutral (exactly zero), or positive.

def rank(data):
    """Map a sentiment polarity score to a categorical label.

    Parameters
    ----------
    data : float
        Polarity score.

    Returns
    -------
    str or float
        'Negtive' for scores in [-1.0, 0.0), 'Natural' for exactly 0.0 and
        'Positive' for scores in (0.0, 1.0]. A value outside [-1.0, 1.0] is
        returned unchanged. The label spellings are used verbatim as class
        names downstream, so they are part of this function's contract.
    """
    if -1.0 <= data < 0.0:
        return 'Negtive'
    if data == 0.0:
        return 'Natural'
    if 0.0 < data <= 1.0:
        return 'Positive'
    # Not a score in [-1, 1]: hand the input back untouched.
    return data

# Label every tweet by passing its sentiment score through rank().
trump['rank'] = trump['sentiment'].map(rank)
    

In [12]:
# Inspect the first five rows to confirm the new 'rank' labels line up with the scores.
trump.head()

Unnamed: 0,index,retweet_count,favorite_count,cleanLinks,sentiment,rank
0,0,20416,102303,I am in Japan at the G-20 representing our Cou...,-0.295,Negtive
1,1,15350,78061,The Stock Market went up massively from the da...,0.0,Natural
2,2,59615,233426,All Democrats just raised their hands for givi...,-0.036458,Negtive
3,3,12999,63019,Great to be back in Japan for the #G20OsakaSummit,0.4,Positive
4,4,22269,94193,Bipartisan Humanitarian Aid Bill for the South...,0.125,Positive


# Text preprocessing and creating the bag-of-words (BOW) matrix


In [13]:
# (rows, columns) of the prepared frame.
trump.shape

(10921, 6)

In [14]:
# Normalise every cleaned tweet into a lower-case, space-separated string:
#   1. replace every character that is not an ASCII letter with a space,
#   2. lower-case and split on whitespace (this also collapses repeated spaces),
#   3. join the tokens back together with single spaces.
# Iterating over the column itself (instead of range(<row count>) plus label
# lookups) keeps the cell correct when the number of rows or the index changes.
# No stop-word removal or stemming is applied.
corpus = [
    " ".join(re.sub('[^a-zA-Z]', ' ', tweet).lower().split())
    for tweet in trump['cleanLinks']
]

In [16]:
# Build the bag-of-words matrix: one row per tweet, one column per vocabulary term.
# The result of fit_transform is kept as a SciPy sparse matrix rather than
# being expanded with .toarray(): almost every entry is zero, so a dense
# rows x vocabulary array costs a large amount of memory for no benefit.
# train_test_split and LogisticRegression both accept sparse input.
cv = CountVectorizer()
bow = cv.fit_transform(corpus)

# Data prep


In [17]:
# Features are the bag-of-words counts; the target is the sentiment label.
X, y = bow, trump['rank']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fitting the logistic regression model



In [18]:
# Train a logistic regression classifier with the library's default settings.
# fit() returns the estimator itself, so construction and training are chained.
log = LogisticRegression().fit(X_train, y_train)
# Show the fitted estimator (and its parameters) as the cell output.
log



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

# Checking the results


In [19]:
# Predict labels for the held-out rows.
pred = log.predict(X_test)

# Confusion matrix, per-class report and overall accuracy, in that order.
evaluation_parts = (
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    accuracy_score(y_test, pred),
)
# Three newlines between parts give the same two blank lines of spacing
# as printing '\n' between them.
print(*evaluation_parts, sep='\n\n\n')


[[ 632   47   70]
 [  61  543  124]
 [  76   99 1625]]


              precision    recall  f1-score   support

     Natural       0.82      0.84      0.83       749
     Negtive       0.79      0.75      0.77       728
    Positive       0.89      0.90      0.90      1800

   micro avg       0.85      0.85      0.85      3277
   macro avg       0.83      0.83      0.83      3277
weighted avg       0.85      0.85      0.85      3277



0.8544400366188587
