### importing libraries

In [1]:
import collections
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the pre-scored tweets; the relative path is resolved against the notebook's working directory.
tweets_and_sentiment = pd.read_csv('sentiments_tweets.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
tweets_and_sentiment.head()

Unnamed: 0,text,afinn score,sentiment,word_count,sentiment_transformer,sentiment_tansformer_NP,transformer_score,sentiment_textblob
0,sacrificed everything protect corona patients ...,1.0,0.3818,7,0.99119,-1.0,-0.99119,0.0
1,alert shall new norms laws movement stricter o...,-3.0,-0.2732,8,0.921373,1.0,0.921373,0.136364
2,seven promises need make extra care senior cit...,1.0,0.5719,27,0.973535,-1.0,-0.973535,0.016667
3,please move unnecessarily help prevent,2.0,0.6249,5,0.988771,-1.0,-0.988771,-0.4
4,dear leader nation modi ji suggestion need fol...,3.0,0.6808,26,0.99452,-1.0,-0.99452,-0.133333


In [53]:
# Features are the tweet text; the target is the TextBlob sentiment column.
X = tweets_and_sentiment['text']
Y = tweets_and_sentiment['sentiment_textblob']

In [5]:
def linear_to_cat(int_):
    """Map a numeric score to a sentiment class label by its sign.

    Parameters
    ----------
    int_ : number
        Score to classify (despite the name, floats are accepted).

    Returns
    -------
    int or None
        1 for a positive score, -1 for a negative score, 0 for exactly zero.
        A value that is none of these (e.g. NaN) yields None.
    """
    if int_ > 0:
        return 1
    if int_ < 0:
        return -1
    if int_ == 0:
        return 0
    # Nothing matched (NaN compares False to everything): no label.
    return None

In [6]:
# Collapse each score to its sign (-1, 0 or 1) so the target becomes a class label.
Y = Y.apply(linear_to_cat)

### importing libraries for machine learning model

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier,LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report,f1_score,precision_score

In [8]:
# Classifiers to compare, keyed by the display name used in reports and in the saved file names.
# NOTE(review): this rebinds the name `models`, which the import cell bound to
# `tensorflow.keras.models`; after this cell `models` is a plain dict.
models = {"Logistic Regression" : LogisticRegression(solver='newton-cg'),
         "SVC" : SVC(),
         "Multinomial NB" : MultinomialNB(), 
         "Bernoulli NB" : BernoulliNB(), 
         "Ridge Classifier" : RidgeClassifier(), 
         "AdaBoost" : AdaBoostClassifier(), 
         "Perceptron" : Perceptron(),
         "Passive-Aggresive" : PassiveAggressiveClassifier()}
# Currently excluded from the comparison:
#         "KNN" : KNeighborsClassifier(),
#         'Random forest' : RandomForestClassifier()}

In [62]:
# TF-IDF vectorizer capped at 2000 features, with sublinear term-frequency scaling enabled.
tfid = TfidfVectorizer(max_features=2000, sublinear_tf =  True)

<IPython.core.display.Javascript object>

In [63]:
# Vectorise straight from the DataFrame column instead of from `X`, so that
# re-running this cell does not feed the already-transformed array back into
# the vectorizer.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so the held-out rows also shape the learned vocabulary and weights.
X = tfid.fit_transform(tweets_and_sentiment['text']).toarray()

In [64]:
# Sanity check: the feature matrix and the label vector should have the same number of rows.
X.shape,Y.shape

((109338, 2000), (109338,))

In [12]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [14]:
# Fit every classifier on the training split and keep the fitted estimators by display name.
trained_model = dict()

for name, estimator in models.items():
    trained_model[name] = estimator.fit(x_train, y_train)

**----------------------------------------------------------------------------------------------------------------------------------------------------------**

**----------------------------------------------------------------------------------------------------------------------------------------------------------**

### saving the model for future use

In [25]:
import joblib

In [31]:
# Persist each fitted classifier to '<display name>.sav' in the working directory.
# `file_name` keeps the written paths, in the same order as `trained_model`.
file_name = [name + '.sav' for name in trained_model]
for path, fitted in zip(file_name, trained_model.values()):
    joblib.dump(fitted, path)

In [32]:
# Show the file names that were written.
file_name

['Logistic Regression.sav',
 'SVC.sav',
 'Multinomial NB.sav',
 'Bernoulli NB.sav',
 'Ridge Classifier.sav',
 'AdaBoost.sav',
 'Perceptron.sav',
 'Passive-Aggresive.sav']

In [33]:
# Reload one saved model to verify the save/load round trip.
# joblib.load unpickles the file, so only load files you created yourself.
lr_from_joblib = joblib.load('Logistic Regression.sav')

In [34]:
# NOTE: this scores the reloaded model on the training split, so the figure is
# a training score, not a held-out estimate.
lr_from_joblib.score(x_train,y_train)

0.903544072253344

**----------------------------------------------------------------------------------------------------------------------------------------------------------**

**----------------------------------------------------------------------------------------------------------------------------------------------------------**

In [36]:
# Evaluate every fitted classifier on the test split: keep its predictions and
# weighted F1 / precision by display name, and print a per-class report.
model_predict, f1_score_, precision_ = {}, {}, {}

for i, fitted in trained_model.items():
    predictions = fitted.predict(x_test)
    model_predict[i] = predictions
    f1_score_[i] = f1_score(y_test, predictions, average='weighted')
    precision_[i] = precision_score(y_test, predictions, average='weighted')
    print(f'--------------- {i}--------------')
    print(f'{classification_report(y_test,model_predict[i])}')

--------------- Logistic Regression--------------
              precision    recall  f1-score   support

          -1       0.91      0.73      0.81      4408
           0       0.85      0.98      0.91      7448
           1       0.93      0.90      0.92     10012

    accuracy                           0.89     21868
   macro avg       0.90      0.87      0.88     21868
weighted avg       0.90      0.89      0.89     21868

--------------- SVC--------------
              precision    recall  f1-score   support

          -1       0.91      0.74      0.82      4408
           0       0.85      0.99      0.92      7448
           1       0.94      0.90      0.92     10012

    accuracy                           0.90     21868
   macro avg       0.90      0.88      0.88     21868
weighted avg       0.90      0.90      0.90     21868

--------------- Multinomial NB--------------
              precision    recall  f1-score   support

          -1       0.91      0.41      0.56      4408


In [37]:
# Weighted F1 on the test split, per classifier.
f1_score_

{'Logistic Regression': 0.892982128595712,
 'SVC': 0.8980213925245182,
 'Multinomial NB': 0.7251300126170527,
 'Bernoulli NB': 0.7940471116879367,
 'Ridge Classifier': 0.8786145193884284,
 'AdaBoost': 0.736261332438408,
 'Perceptron': 0.8094492213024578,
 'Passive-Aggresive': 0.885973954368919}

In [38]:
# Weighted precision on the test split, per classifier.
precision_

{'Logistic Regression': 0.8986290671689652,
 'SVC': 0.9033204445698527,
 'Multinomial NB': 0.7765671726185259,
 'Bernoulli NB': 0.7933830763197323,
 'Ridge Classifier': 0.8840584688381206,
 'AdaBoost': 0.7991581566653012,
 'Perceptron': 0.8234863465662463,
 'Passive-Aggresive': 0.8913152343773036}

### deep learning model - configuration and data preparation

In [None]:
# Tunable settings for the deep-learning section.
NB_WORDS = 100000  # Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding layer as its input size
VAL_SIZE = 10000  # Size of the validation set (not referenced by the cells visible in this notebook)
NB_START_EPOCHS = 10  # Number of epochs we usually start to train with
BATCH_SIZE = 200  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 128  # Length every token sequence is padded or truncated to

In [74]:
# Split the raw tweet text 90/10 for the deep-learning model.
# NOTE(review): this rebinds y_train / y_test from the earlier TF-IDF split, and
# the targets here are the raw sentiment_textblob values rather than the
# -1/0/1 labels stored in Y - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(tweets_and_sentiment.text, tweets_and_sentiment.sentiment_textblob, test_size=0.1, random_state=37)

In [75]:
# Build the vocabulary from the training tweets only, then encode both splits
# as sequences of word indices.
# The filter list escapes the backslash explicitly ('\\]' instead of the
# invalid escape '\]') and strips '|' via '{|}'; the earlier '{"}' only
# repeated the double quote that is already in the list.
tk = Tokenizer(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk.fit_on_texts(X_train)
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [76]:
# Bring every encoded tweet to a common length of MAX_LEN so the sequences can be batched.
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [77]:
# Carve a 10% validation set out of the padded training sequences.
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train, test_size=0.1, random_state=37)

### deep learning model

In [78]:
def deep_model(model, X_train, y_train, X_valid, y_valid, epochs=11, batch_size=100):
    """Compile and train a Keras model, validating after every epoch.

    The model is compiled with the Adam optimizer, the squared-hinge loss and
    accuracy as the reported metric, then fitted on the training data.

    Parameters
    ----------
    model : Keras model
        Uncompiled (or to-be-recompiled) model; it is modified in place.
    X_train, y_train : array-like
        Training inputs and targets.
    X_valid, y_valid : array-like
        Validation inputs and targets, passed as ``validation_data``.
    epochs : int, optional
        Number of training epochs (default 11, the previously hard-coded value).
    batch_size : int, optional
        Mini-batch size (default 100, the previously hard-coded value).

    Returns
    -------
    The object returned by ``model.fit`` (the training history).
    """
    # Alternative losses tried here: squared_hinge, hinge.
    # NOTE(review): hinge-style losses are normally used with -1/1 targets -
    # confirm the targets passed in match.
    model.compile(optimizer='adam', loss='squared_hinge', metrics=['accuracy'])

    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_valid, y_valid),
    )
    return history

In [94]:
# Embedding -> LSTM -> Dropout -> Flatten -> single tanh output unit.
# `Sequential` is used directly instead of `models.Sequential`: by this point
# the name `models` refers to the dict of scikit-learn classifiers, which has
# no `Sequential` attribute.
model = Sequential()
model.add(layers.Embedding(NB_WORDS, 8,embeddings_initializer="glorot_uniform", input_length=MAX_LEN))
model.add(layers.LSTM(8,activation='tanh',return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(1,activation='tanh'))
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 70, 8)             800000    
_________________________________________________________________
lstm_9 (LSTM)                (None, 70, 8)             544       
_________________________________________________________________
dropout_6 (Dropout)          (None, 70, 8)             0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 560)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 561       
Total params: 801,105
Trainable params: 801,105
Non-trainable params: 0
_________________________________________________________________


In [95]:
# Train the embedding model on the training/validation split, then show the
# per-epoch training accuracy recorded in the history.
emb_history = deep_model(model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)
emb_history.history['accuracy']

Epoch 1/11

KeyboardInterrupt: 

In [89]:
def eval_metric(history, metric_name):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object with a ``history`` dict
        Training history; must contain ``metric_name`` and
        ``'val_' + metric_name``, each a sequence with one value per epoch.
    metric_name : str
        Name of the metric to plot, e.g. ``'accuracy'`` or ``'loss'``.
    """
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    # Derive the x-axis from the recorded values rather than from a fixed
    # epoch constant, so the plot works for any number of trained epochs.
    e = range(1, len(metric) + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
eval_metric(emb_history, 'accuracy')

**converting text into tokens - tokenization**

In [None]:
# NOTE(review): this rebinds `tk` to a new, unfitted Tokenizer; nothing in this
# cell fits it or uses it - confirm whether the cell is still needed.
# The filter list escapes the backslash explicitly ('\\]' instead of the
# invalid escape '\]') and strips '|' via '{|}' instead of repeating '"'.
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True, split=" ")


In [None]:

# Training vs. validation loss per epoch.
eval_metric(emb_history, 'loss')

In [None]:
def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    model.fit(X_train
              , y_train
              , epochs=epoch_stop
              , batch_size=BATCH_SIZE
              , verbose=0)
    results = model.evaluate(X_test, y_test)
    
    return results


In [None]:

# Train on the full training split for 6 epochs and report accuracy on the
# held-out test split. The network built in this notebook is bound to `model`
# (there is no `emb_model`), so that is what gets evaluated.
emb_results = test_model(model, X_train_seq_trunc, y_train, X_test_seq_trunc, y_test, 6)
print('\n')
# emb_results[1] is the first compiled metric (accuracy); index 0 is the loss.
print('Test accuracy of word embeddings model: {0:.2f}%'.format(emb_results[1]*100))