## Twitter sentiment analysis

Date - 12th Oct 2020

- Data - 'twitterdata.txt' 
     - There are 30 thousands tweets with their corresponding sentiments.
     - Now our model will learn from those 30,000 tweets and predicts the sentiments for forthcoming tweets in future.
     - Model will be evaluated on the basis of below parameters
         - confusion matrix
         - classification matrix
     - model being used 
         - SVM
         - logistic regression
   
     

In [3]:
import numpy as np
import pandas as pd


In [45]:

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [46]:
df = pd.read_csv('twitterdata.txt')

In [47]:
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


In [12]:
df.shape

(30000, 2)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [52]:
def execute_SVMmodel(x):

    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer()
    X_tfidf = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.33, random_state=42)


    clf_l = LinearSVC()
    clf_s = SVC()

    clf_l.fit(X_train,y_train)
    clf_s.fit(X_train,y_train)

    y_pred_l = clf_l.predict(X_test)
    y_pred_s = clf_s.predict(X_test)

    print(f'Accurancy for LinearSVC {accuracy_score(y_test,y_pred_l)} and SVC {accuracy_score(y_test,y_pred_s)}')

    print(f'linear - \n{confusion_matrix(y_test,y_pred_l)}')
    print (f'\n\nsvc - \n{confusion_matrix(y_test,y_pred_s)}')


    print(f'linear ------ \n {classification_report(y_test,y_pred_l)}')
    print (f'svc ------ \n {classification_report(y_test,y_pred_s)}')
    
    # we can pass the object for further exucution also
    return tfidf, clf_l, clf_s

In [50]:
execute_SVMmodel(df)



Accurancy for LinearSVC 0.752020202020202 and SVC 0.4918181818181818
linear - 
[[3599 1270]
 [1185 3846]]


svc - 
[[4869    0]
 [5031    0]]
linear ------ 
               precision    recall  f1-score   support

           0       0.75      0.74      0.75      4869
           1       0.75      0.76      0.76      5031

   micro avg       0.75      0.75      0.75      9900
   macro avg       0.75      0.75      0.75      9900
weighted avg       0.75      0.75      0.75      9900

svc ------ 
               precision    recall  f1-score   support

           0       0.49      1.00      0.66      4869
           1       0.00      0.00      0.00      5031

   micro avg       0.49      0.49      0.49      9900
   macro avg       0.25      0.50      0.33      9900
weighted avg       0.24      0.49      0.32      9900




### Data Cleaning 

- lowering all character
- expanding the contraction
- removing email,URL, HTML
- removing accented char
- lemmatization
- spelling correction


In [83]:
import re
from  bs4 import BeautifulSoup
import unicodedata
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
nlp = spacy.load('en_core_web_sm')
from textblob import TextBlob

In [58]:
df['twitts'] = df['twitts'].apply(lambda x : x.lower())

In [60]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
def expand_contraction(x):
    for key,item in contraction_mapping.items():
        x=x.replace(key,item,-1)
    return x

In [61]:
df['twitts']=df['twitts'].apply(lambda x : expand_contraction(x))

In [64]:
df['twitts']=df['twitts'].apply(lambda x : (re.sub('\S+@\S+\.com','',x)))

In [65]:
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
df['twitts']=df['twitts'].apply(lambda x : re.sub(regex, '', x))

In [66]:
# all special character , other than all alphanumeric
df['twitts']=df['twitts'].apply(lambda x : re.sub('[^\w ]+','',x))

In [67]:
df['twitts']=df['twitts'].apply(lambda x : re.sub('\s{2,}',' ',x))

In [70]:
df['twitts']=df['twitts'].apply(lambda x : BeautifulSoup(x,'html').get_text().strip())

In [72]:
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii','ignore').decode('utf-8','ignore')
    return x
df['twitts']=df['twitts'].apply(lambda x : remove_accented_chars(x))

In [74]:
df['twitts']= df['twitts'].apply(lambda x : ' '.join([w for w in x.split() if w not in STOP_WORDS]))

In [79]:
def convert_to_rootword(x):
    doc = nlp(x)
    x_list=[]
    for token in doc:
        lemma = token.lemma_
        x_list.append(lemma)
    return ' '.join(x_list)

df['twitts'] = df['twitts'].apply(lambda x : convert_to_rootword(x))

In [81]:
text = ' '.join(df['twitts'])


### After cleaning the data , will train the model again. And will check if efficiency is increased or not.

In [132]:
execute_SVMmodel(df)



Accurancy for LinearSVC 0.7226262626262626 and SVC 0.4918181818181818
linear - 
[[3406 1463]
 [1283 3748]]


svc - 
[[4869    0]
 [5031    0]]
linear ------ 
               precision    recall  f1-score   support

           0       0.73      0.70      0.71      4869
           1       0.72      0.74      0.73      5031

   micro avg       0.72      0.72      0.72      9900
   macro avg       0.72      0.72      0.72      9900
weighted avg       0.72      0.72      0.72      9900

svc ------ 
               precision    recall  f1-score   support

           0       0.49      1.00      0.66      4869
           1       0.00      0.00      0.00      5031

   micro avg       0.49      0.49      0.49      9900
   macro avg       0.25      0.50      0.33      9900
weighted avg       0.24      0.49      0.32      9900



  'precision', 'predicted', average, warn_for)


(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
      verbose=0),
 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False))