# Twitter Sentiment Analysis

In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train_E6oV3lV.csv")
test = pd.read_csv("test_tweets_anuFYb8.csv")

In [3]:
for feature in train.columns:
    print("unique values in {} are {}".format(feature,train[feature].nunique()))

unique values in id are 31962
unique values in label are 2
unique values in tweet are 29530


In [4]:
train.shape , test.shape

((31962, 3), (17197, 2))

In [5]:
submission = pd.DataFrame()
submission["id"] = test["id"]

In [6]:
train.drop(columns = ["id"] , axis =1 , inplace = True)
test.drop(columns = ["id"] , axis =1 , inplace = True)

### Data cleaning by removing the unnecessary words and symbol in tweet feature

In [7]:
def tweet_cleaned(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z]'," ",text)
    text = re.sub(r'\\n'," ",text)
    text = re.sub(r'\W'," ",text)
    text = re.sub(r'https\s+|www.\s+'," ",text)
    text = re.sub(r'http\s+|www.\s+'," ",text)
    text = re.sub(r'\s+[a-zA-Z]\s+'," ",text)
    text = re.sub(r'\^[a-zA-Z]\s+'," ",text)
    text = re.sub(r"won\'t", "will not",text)
    text = re.sub(r'can\'t', "can not",text)
    text = re.sub(r'don\'t', "do not",text)
    text = re.sub(r'dont', "do not",text)
    text = re.sub(r'n\’t', " not",text)
    text = re.sub(r'n\'t', " not",text)
    text = re.sub(r'\'re', " are",text)
    text = re.sub(r'\'s', " is",text)
    text = re.sub(r'\’d', " would",text)
    text = re.sub(r'\d', " would",text)
    text = re.sub(r'\'ll', " will",text)
    text = re.sub(r'\'t', " not",text)                  
    text = re.sub(r'\'ve', " have",text)
    text = re.sub(r'\'m', " am",text)
    text = re.sub(r'\n', "",text)
    text = re.sub(r'\r', "",text)
    text = re.sub(r'[0-9]', "digit",text)
    text = re.sub(r"\'", "",text)
    text = re.sub(r"\"", "",text)
    text = re.sub(r'[?|!|\'|"|#]',r'',text)
    text = re.sub(r'[.|,|)|(|\|/]',r' ',text)    
    text = re.sub(r'\s+', ' ',text)
    return text

train["tweet_cleaned"] = train["tweet"].apply(tweet_cleaned)
test["tweet_cleaned"] = test["tweet"].apply(tweet_cleaned)

In [8]:
train.head()

Unnamed: 0,label,tweet,tweet_cleaned
0,0,@user when a father is dysfunctional and is s...,user when father is dysfunctional and is so s...
1,0,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit can use caus...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,model love take with all the time in ur
4,0,factsguide: society now #motivation,factsguide society now motivation


In [9]:
train.drop(columns = ["tweet"] , axis =1 , inplace = True)
test.drop(columns = ["tweet"] , axis =1 , inplace = True)

In [10]:
X = train.drop(labels = ["label"],axis =1)
y = train["label"].values

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score

from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier , RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMRegressor

In [12]:
X_train , X_cv , y_train , y_cv = train_test_split(X,y,test_size=0.25,random_state=42)

In [13]:
tf1 = TfidfVectorizer(ngram_range=(1,2),stop_words = 'english' , token_pattern = r'\w{3,}' , min_df = 3)

#tf1 = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}', ngram_range=(1, 4), use_idf=1,smooth_idf=1,sublinear_tf=1, stop_words = 'english')
tfidf_train_tweets = tf1.fit_transform(X_train["tweet_cleaned"])
tfidf_cv_tweets = tf1.transform(X_cv["tweet_cleaned"])
#tfidf_cv_tweets = pd.DataFrame(data = tfidf_cv_tweets.toarray() , columns = tf1.get_feature_names())

In [14]:
xgb = XGBClassifier(learning_rate=0.1, 
                   gamma=100, 
                   max_depth=25,  
                   min_child_weight=1, 
                   max_delta_step=0, 
                   subsample=0.75,  
                   colsample_bylevel=0.95,  
                   colsample_bytree=0.70,  
                   reg_lambda=1)

xgb.fit(tfidf_train_tweets, y_train)
y_pred1 = xgb.predict(tfidf_cv_tweets)
print('Confusion matrix:', confusion_matrix(y_cv, y_pred1))
print('Classification_report:', classification_report(y_cv, y_pred1))
print('Accuracy_score:', accuracy_score(y_cv, y_pred1))

Confusion matrix: [[7432    0]
 [ 551    8]]
Classification_report:               precision    recall  f1-score   support

           0       0.93      1.00      0.96      7432
           1       1.00      0.01      0.03       559

    accuracy                           0.93      7991
   macro avg       0.97      0.51      0.50      7991
weighted avg       0.94      0.93      0.90      7991

Accuracy_score: 0.9310474283569015


In [15]:
svm = SVC()
svm.fit(tfidf_train_tweets, y_train)
y_pred2 = svm.predict(tfidf_cv_tweets)
print('Confusion matrix:', confusion_matrix(y_cv, y_pred2))
print('Classification_report:', classification_report(y_cv, y_pred2))
print('Accuracy_score:', accuracy_score(y_cv, y_pred2))

Confusion matrix: [[7415   17]
 [ 326  233]]
Classification_report:               precision    recall  f1-score   support

           0       0.96      1.00      0.98      7432
           1       0.93      0.42      0.58       559

    accuracy                           0.96      7991
   macro avg       0.94      0.71      0.78      7991
weighted avg       0.96      0.96      0.95      7991

Accuracy_score: 0.9570767113002128


In [16]:
lr = LogisticRegression()
lr.fit(tfidf_train_tweets, y_train)
y_pred3 = lr.predict(tfidf_cv_tweets)
print('Confusion matrix:', confusion_matrix(y_cv, y_pred3))
print('Classification_report:', classification_report(y_cv, y_pred3))
print('Accuracy_score:', accuracy_score(y_cv, y_pred3))

Confusion matrix: [[7414   18]
 [ 388  171]]
Classification_report:               precision    recall  f1-score   support

           0       0.95      1.00      0.97      7432
           1       0.90      0.31      0.46       559

    accuracy                           0.95      7991
   macro avg       0.93      0.65      0.72      7991
weighted avg       0.95      0.95      0.94      7991

Accuracy_score: 0.9491928419471906


In [17]:
submission["label"] =  svm.predict(tf1.transform(test["tweet_cleaned"]))

In [18]:
submission.head(10)

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0
5,31968,0
6,31969,0
7,31970,0
8,31971,0
9,31972,0


In [19]:
submission.to_csv("submission.csv" , index = False)