In [110]:
#General
import os
import re
import sklearn
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

# Preprocessing
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest,chi2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [111]:

df = pd.read_csv('twitter_sentiment_data.csv',encoding='utf-8')
#df.drop(df.columns[1],axis=1,inplace=True)
df = df.drop_duplicates()  
df.head()



Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [0]:
#split x and y
x=df['message']
y=df['sentiment']

In [0]:
#stemming
def stemming(words):
    ps = nltk.stem.SnowballStemmer('english')
    #new = []
    stem_words = [ps.stem(x) for x in words]
    return stem_words

In [0]:
#preprocess


def preprocess(sentence):
    #sentence = contractions.fix(sentence)             # Replace contractions in string 
    #sentence = re.sub('@\w+',"",str(sentence))        # remove mentions
    #sentence = re.sub('#\w+',"",str(sentence))        # remove hashtags
    sentence = re.sub('http\S+',"",str(sentence))     # remove URLs
    #sentence = re.sub('[^\w\s]'," ",str(sentence))    # remove punc
    sentence = re.sub('[^a-zA-Z]'," ",str(sentence))  # remove everything other then text
    
    sents = word_tokenize(sentence)                   # Tokenization
    sents = stemming(sents)                           #stemming
   

    
    return sents



In [115]:
#Remove words smaller than 3 characters , preprocess and join words together

x=x.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
x=x.apply(preprocess)
#x


for i in range(len(x)):
    x[i] = ' '.join(x[i])

x

0        tiniebeani climat chang interest hustl global ...
1        natgeochannel watch beforetheflood right here ...
2        fabul leonardo dicaprio s film climat chang br...
3        mick fan just watch this amaz documentari leon...
4        cnaliv pranita biswasi lutheran from odisha gi...
                               ...                        
43938    dear realdonaldtrump yeah right human mediat c...
43939    what will your respect parti prevent climat ch...
43940    mikkil poll show climat chang lowest global co...
43941    taehbeingextra still can q t believ this taehy...
43942    likeabat zachhal wealthi fossil fuel industri ...
Name: message, Length: 43943, dtype: object

In [0]:
#TF-IDF feature extraction

tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(x)
chi_tfidf.shape

In [0]:
#Apply Kbest feature selection
from sklearn.feature_selection import SelectKBest,chi2
K = 900
vectorizer_chi2 = SelectKBest(chi2,k=K)
chi_tfidf = vectorizer_chi2.fit_transform(tfidf,y)
#chi_test_corpus_tf_idf = vectorizer_chi2.transform(test_corpus_tf_idf)


In [0]:
#train test split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
X_train,X_test,y_train,y_test=train_test_split(chi_tfidf,y,test_size=0.33)



In [69]:
#Logistic Regression
lreg = LogisticRegression(multi_class='auto')
lreg.fit(X_train, y_train) # training the model

prediction = lreg.predict(X_test) # predicting on the validation set
accuracy = metrics.accuracy_score(y_test, prediction)
print('Accuracy: {:.2f}'.format(accuracy))
#prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
#prediction_int = prediction_int.astype(np.int)

#f1_score(yvalid, prediction_int) # calculating f1 score

Accuracy: 0.67


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [109]:
#SVM
from sklearn import svm
from sklearn.svm import SVC

sv = SVC(decision_function_shape='ovo')
sv.fit(X_train, y_train)
prediction = sv.predict(X_test) # predicting on the test set
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.73      0.37      0.50      1277
           0       0.63      0.36      0.46      2598
           1       0.70      0.89      0.79      7578
           2       0.71      0.65      0.68      3049

    accuracy                           0.70     14502
   macro avg       0.69      0.57      0.60     14502
weighted avg       0.69      0.70      0.68     14502



In [82]:
#Complement NB
from sklearn import naive_bayes

cnb = naive_bayes.ComplementNB(norm='boolean')
cnb.fit(X_train, y_train)
prediction = cnb.predict(X_test) # predicting on the test set
accuracy = metrics.accuracy_score(y_test, prediction)
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.60


In [103]:
#Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,50),alpha=0.01, learning_rate_init=0.1,learning_rate='adaptive',random_state=1)
mlp.fit(X_train, y_train)
prediction = mlp.predict(X_test) # predicting on the test set
accuracy = metrics.accuracy_score(y_test, prediction)
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.65


In [0]:
#MLP with Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV

parameter_space = {
    'hidden_layer_sizes': [(100,),(150,100),(150,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01,0.05],
    'learning_rate': ['constant','adaptive'],
}

clf = RandomizedSearchCV(mlp, parameter_space)
clf.fit(X_train, y_train)
print('Best parameters found:\n', clf.best_params_)


In [108]:
prediction = clf.predict(X_test) # predicting on the test set
#accuracy = metrics.accuracy_score(y_test, prediction)
#print('Accuracy: {:.2f}'.format(accuracy))
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.62      0.43      0.51      1277
           0       0.59      0.38      0.46      2598
           1       0.72      0.85      0.78      7578
           2       0.68      0.68      0.68      3049

    accuracy                           0.69     14502
   macro avg       0.65      0.58      0.61     14502
weighted avg       0.68      0.69      0.68     14502



In [121]:
#Bagged SVM
from sklearn.ensemble import BaggingClassifier
bsv = BaggingClassifier(base_estimator=SVC(), n_estimators=31, random_state=1, n_jobs=-1).fit(X_train, y_train)
prediction = bsv.predict(X_test) # predicting on the test set
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.73      0.36      0.48      1328
           0       0.61      0.39      0.48      2536
           1       0.71      0.89      0.79      7616
           2       0.73      0.66      0.69      3022

    accuracy                           0.70     14502
   macro avg       0.70      0.57      0.61     14502
weighted avg       0.70      0.70      0.69     14502

