# Twitter Sentiment Analysis

 Data link: https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv

### Load the Library

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

###  Load the dataset

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [4]:
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

### SVM Model And Data Preparation

In [5]:
def run_svm(df):
  #intalize the target and training variable
  X = df['twitts']
  y = df['sentiment']

  # load the TFIDF
  tfidf = TfidfVectorizer()
  X = tfidf.fit_transform(X)

  #Split the data
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

  print('shape of X: ', X.shape)

  #Train the SVC Model
  clf = LinearSVC()
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  print()
  print('Printing Report')
  print(classification_report(y_test, y_pred))

  return tfidf, clf

In [6]:
%%time
tfidf,clf =run_svm(df)

shape of X:  (30000, 40854)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 1.57 s, sys: 41.2 ms, total: 1.61 s
Wall time: 2.59 s


### Test The model

In [7]:
x = ['i am really happy. thanks a lot for coming with me']

In [8]:
clf.predict(tfidf.transform(x))

array([1])

## Data Cleaning and Retraining SVM

#### Use our preprocess python package

In [11]:
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-ekwh_2xt
  Running command git clone --filter=blob:none --quiet https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-ekwh_2xt
  Resolved https://github.com/laxmimerit/preprocess_kgptalkie.git to commit 9ca68d37027af9f6a30d54640347ce3b2e2694b3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-py3-none-any.whl size=7620 sha256=8d12c1849fc51abf4c0ce8813c5b45aba7d9a6d41da64d754fc1f7d2d80832b0
  Stored in directory: /tmp/pip-ephem-wheel-cache-e6xdd983/wheels/5c/94/34/99d5ff65e88b8d9a6c5e8d8652f2311d87790a61

In [12]:
import preprocess_kgptalkie as pp

In [13]:
pp.__version__

'0.10.3'

In [14]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [15]:
df['twitts'] = df['twitts'].apply(lambda x: pp.cont_exp(x))

In [16]:
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0
...,...,...
29995,@calumfan1 is it in any way related to photosh...,0
29996,@swiz_nz really? wow thats crap,0
29997,"at the 2010 lexus hs250h press event. again, ...",0
29998,@karmicunderpath ooooh now there is a nice tho...,1


In [17]:
run_svm(df)

shape of X:  (30000, 40753)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

In [18]:
# remove emails and urls

df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))

  return BeautifulSoup(x, 'lxml').get_text().strip()


In [19]:
tfidf, clf = run_svm(df)

shape of X:  (30000, 42855)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [20]:
clf.predict(tfidf.transform(x))

array([1])

### Fine tune the Model

In [22]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))

    return tfidf, clf

tfidf,clf=run_svm(df)

shape of X:  (30000, 5000)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.76      0.74      0.75      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



### Saving and Loading ML MOdel

In [23]:
import pickle

In [24]:
pickle.dump(clf, open('clf.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [25]:
del clf
del tfidf

In [26]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [27]:
clf

In [28]:
tfidf.vocabulary_

{'sounds': 3723,
 'like': 2368,
 'great': 1582,
 'night': 2844,
 'sounds like': 3725,
 'great night': 1584,
 'damn': 898,
 'the': 3958,
 'person': 3204,
 'who': 4697,
 'my': 2714,
 'may': 2533,
 'come': 804,
 'back': 407,
 'and': 209,
 'bite': 541,
 'you': 4899,
 'in': 1924,
 'ass': 361,
 'come back': 805,
 'back and': 408,
 'you in': 4936,
 'in the': 1954,
 'from': 1394,
 'photo': 3213,
 'from the': 1399,
 'love': 2453,
 'it': 2095,
 'haha': 1624,
 'forget': 1368,
 'should': 3558,
 'give': 1473,
 'me': 2541,
 'lie': 2362,
 'please': 3247,
 'would': 4835,
 'be': 442,
 'awesome': 398,
 'if': 1888,
 'did': 965,
 'love it': 2456,
 'love you': 2466,
 'give me': 1475,
 'it would': 2155,
 'would be': 4836,
 'if you': 1902,
 'you did': 4918,
 'pretty': 3284,
 'then': 4086,
 'site': 3599,
 'skin': 3606,
 'really': 3351,
 'upset': 4450,
 'big': 526,
 'pool': 3258,
 'or': 3106,
 'might': 2613,
 'able': 37,
 'to': 4191,
 'but': 615,
 'that': 3914,
 'is': 1986,
 'as': 343,
 'will': 4717,
 'get': 1

In [29]:
x

['i am really happy. thanks a lot for coming with me']

In [30]:
clf.predict(tfidf.transform(x))

array([1])