In [None]:
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pandas as pd
import numpy as np 
import random
import time

In [None]:
# Read both CSV inputs: `data` supplies the 'airline_sentiment' labels and the
# model features; `new_data` is the frame that later receives predictions.
data, new_data = map(pd.read_csv, ("labeled_data2.csv", "api_data2.csv"))

In [None]:
# Shuffle the labeled rows so the positional train/test split further down is
# random. A fixed random_state makes the shuffle -- and therefore the split
# and every score derived from it -- reproducible after a kernel restart.
data = data.sample(frac=1, random_state=0)

# Target labels. X still contains this column at this point; the non-feature
# columns are removed in the next cell.
y = data['airline_sentiment']
X = data

# Frame that is scored with the fitted model later on.
Xnew = new_data

In [None]:
# dropping columns not important
# A single shared list keeps the labeled frame and the frame to be scored in
# sync: the same columns are removed from both.
NON_FEATURE_COLUMNS = [
    'Unnamed: 0', 'tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
    'negativereason', 'negativereason_confidence', 'airline_sentiment_gold',
    'negativereason_gold',
    'text', 'hashtags', 'user_timezone', 'tweet_coord', 'full_text_processed',
    'sents', 'words', 'clean_words',
    'user_location', 'tweet_location', 'tweet_created', 'user_acc_date', 'lang',
    'name', 'airline', 'user_verified',
]

# drop() raises KeyError if any listed column is missing from a frame, which
# surfaces schema differences between the two CSV files immediately.
X = X.drop(columns=NON_FEATURE_COLUMNS)
Xnew = Xnew.drop(columns=NON_FEATURE_COLUMNS)


In [None]:
#split into train/test
# The first 70% of the (already shuffled) rows train the models; the rest are
# held out. The cut-off is derived from the actual row count rather than a
# hard-coded 14640, and slicing starts at position 0 so the first row is no
# longer silently dropped from the training set.
index = round(.7 * len(X))

# .iloc makes the positional slicing explicit: after shuffling, the index
# labels are out of order, so only positions are meaningful here.
Xtrain = X.iloc[:index]
Xtest = X.iloc[index:]
ytrain = y.iloc[:index]
ytest = y.iloc[index:]


# Algorithm Comparisons

In [None]:
# Multinomial naive Bayes: construct and fit on the training split, timing
# construction + fit together.
t0 = time.time()
mnb = MultinomialNB().fit(Xtrain, ytrain)  # fit() returns the estimator itself
t1 = time.time() - t0
print("Time elapsed: ", t1)

# Cell output: mean accuracy on the training split.
mnb.score(Xtrain, ytrain)


In [None]:
# Random forest with 500 trees and a fixed seed; construction + fit are timed.
t0 = time.time()
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=0,
).fit(Xtrain, ytrain)  # fit() returns the estimator itself
t1 = time.time() - t0
print("Time elapsed: ", t1)



In [None]:
# Logistic Regression (multinomial)
# Only construction + fit are timed, matching the Naive Bayes and Random
# Forest cells; scoring is kept outside the timed region.
t0 = time.time()
log_model = LogisticRegression(multi_class='multinomial')
log_model.fit(Xtrain, ytrain)
t1 = time.time() - t0
print("Time elapsed: ", t1)

# So far Logistic model has performed the best on the dataset,
# but the model can be improved upon by using regularization
# Held-out accuracy is the cell's last expression so it is actually displayed
# (previously it was computed mid-cell and its value discarded).
log_model.score(Xtest, ytest)

In [None]:
# Elastic Net logistic Regression
# Due to sparsity of bag of words matrix, regularization will probably improve model by a lot
# 10-fold CV selects the regularization strength and the L1/L2 mix from
# l1_ratios. random_state pins the data shuffling done by the stochastic
# 'saga' solver so the fitted model is reproducible.
t0 = time.time()
elas_log = LogisticRegressionCV(
    penalty='elasticnet',
    cv=10,
    l1_ratios=[0.2, 0.5, 0.7],
    max_iter=5000,
    solver='saga',
    random_state=0,
)
elas_log.fit(Xtrain, ytrain)
t1 = time.time() - t0
print("Time elapsed: ", t1)

# Held-out accuracy, kept outside the timed region and placed last so the
# notebook displays it (previously the value was computed and discarded).
elas_log.score(Xtest, ytest)

In [None]:
# Score the new rows with the elastic-net model and store the predicted label
# next to each original row of new_data.
predictions = elas_log.predict(Xnew)
new_data['predicted_sentiment'] = predictions

In [None]:
#new_data.to_csv("predictions_api_full.csv",sep=",",encoding='utf-8')

In [None]:
# Accuracy summary: every fitted model on the training split first, then on
# the held-out split, in the same model order.
accuracy_report = [
    ("Naive Bayes Train Accuracy:", mnb, Xtrain, ytrain),
    ("Random Forest Train Accuracy:", rf, Xtrain, ytrain),
    ("Logistic Regression Train Accuracy:", log_model, Xtrain, ytrain),
    ("Elastic Net Train Accuracy:", elas_log, Xtrain, ytrain),
    ("Naive Bayes Test Accuracy:", mnb, Xtest, ytest),
    ("Random Forest Test Accuracy:", rf, Xtest, ytest),
    ("Logistic Regression Test Accuracy:", log_model, Xtest, ytest),
    ("Elastic Net Test Accuracy:", elas_log, Xtest, ytest),
]

for caption, estimator, features, targets in accuracy_report:
    print(caption, estimator.score(features, targets))


# Elastic Net Coefficients

In [None]:
# One-vs-rest target: 1 where the training label is 'negative', otherwise 0.
neg_y = [1 if label == 'negative' else 0 for label in ytrain]

# Binary elastic-net logistic regression tuned by 10-fold CV. random_state
# pins the shuffling done by the stochastic 'saga' solver so the coefficient
# ranking below is reproducible.
neg_log_model = LogisticRegressionCV(
    penalty='elasticnet',
    cv=10,
    l1_ratios=[0.2, 0.5, 0.7],
    max_iter=5000,
    solver='saga',
    random_state=0,
)
neg_log_model.fit(Xtrain, neg_y)

#creating matrix of values
# Use the coefficients of the binary model fitted above. Previously `coefs`
# was overwritten with log_model.coef_, so this table showed the multinomial
# model's weights instead of the elastic-net ones.
# For a binary problem coef_ has a single row: one weight per feature column.
coefs = neg_log_model.coef_
words_list = list(Xtrain.columns)
neg_matrix = pd.DataFrame({'variable': words_list, 'coef_value': coefs[0]})

# Largest coefficients first.
neg_matrix = neg_matrix.sort_values(by='coef_value', ascending=False)


In [None]:
# One-vs-rest target: 1 where the training label is 'positive', otherwise 0.
pos_y = [1 if label == 'positive' else 0 for label in ytrain]

# Binary elastic-net logistic regression tuned by 10-fold CV. random_state
# pins the shuffling done by the stochastic 'saga' solver so the coefficient
# ranking below is reproducible.
pos_log_model = LogisticRegressionCV(
    penalty='elasticnet',
    cv=10,
    l1_ratios=[0.2, 0.5, 0.7],
    max_iter=5000,
    solver='saga',
    random_state=0,
)
pos_log_model.fit(Xtrain, pos_y)

#creating matrix of values
# For a binary problem coef_ has a single row: one weight per feature column.
coefs = pos_log_model.coef_
words_list = list(Xtrain.columns)
pos_matrix = pd.DataFrame({'variable': words_list, 'coef_value': coefs[0]})

# Largest coefficients first.
pos_matrix = pos_matrix.sort_values(by='coef_value', ascending=False)


In [None]:
# First 15 rows of the table sorted by descending coefficient for the
# 'positive' one-vs-rest model.
pos_matrix.head(n=15)

In [None]:
# First 15 rows of the table sorted by descending coefficient for the
# 'negative' one-vs-rest target.
neg_matrix.head(n=15)
#pos_matrix.to_csv("pos_coefs.csv",sep=",",encoding='utf-8')
#neg_matrix.to_csv("neg_coefs.csv",sep=",",encoding='utf-8')