<a href="https://colab.research.google.com/github/anky19698/Twitter-Sentiment-Analysis/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter Sentiment Analysis

## Importing Libraries

In [None]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Using spaCy for NLP

In [None]:
# Only lemmas and the lexical stop-word / punctuation flags are used downstream,
# so the dependency parser and named-entity recognizer are switched off to make
# processing the full tweet corpus faster. They can be re-enabled later with
# `nlp.enable_pipe(...)` if needed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Column names are passed explicitly, so the first line of the CSV is read as data.
TRAIN_COLUMNS = ['tweet_id', 'entity', 'sentiment', 'tweet_text']
data = pd.read_csv('twitter_training.csv', names=TRAIN_COLUMNS)
data.head()


Unnamed: 0,tweet_id,entity,sentiment,tweet_text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
def process_text(text):
    """Normalize a tweet into a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are dropped, and the lemma
    of every remaining token is kept.

    Parameters
    ----------
    text : str
        Raw tweet text. Missing values (``None`` / ``NaN``) are treated as an
        empty tweet; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (``""`` when nothing is kept).
    """
    if not isinstance(text, str):
        # Guard against missing values so the function can be applied to a
        # column that still contains NaN/None without failing inside spaCy.
        text = "" if pd.isna(text) else str(text)

    doc = nlp(text)
    return ' '.join(
        token.lemma_
        for token in doc
        # Whitespace-only tokens are neither stop words nor punctuation and
        # would otherwise leak stray blanks/newlines into the result.
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [None]:
# (rows, columns) of the training frame as loaded, before rows with missing values are dropped.
data.shape

(74682, 4)

In [None]:
# Remove every row that has a missing value in any column before the text is processed.
# This mutates `data` in place, so re-running earlier display cells shows the reduced frame.
data.dropna(inplace=True)

In [None]:
# Run every raw tweet through `process_text` and keep the result alongside the original text.
data["processed_tweet"] = data["tweet_text"].map(process_text)

In [None]:
# Keep only the model input (processed text) and the target label.
model_columns = ["processed_tweet", "sentiment"]
df = data[model_columns]
df.head()

Unnamed: 0,processed_tweet,sentiment
0,m get borderland murder,Positive
1,come border kill,Positive
2,m get borderland kill,Positive
3,m come borderland murder,Positive
4,m get borderland 2 murder,Positive


In [None]:
# Number of tweets per sentiment label.
df["sentiment"].value_counts()

sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

## Defining X and y

In [None]:
# Features are the processed tweets; the target is the sentiment label encoded as an integer id.
label_to_id = {"Irrelevant": 0, "Positive": 1, "Negative": 2, "Neutral": 3}
X = df["processed_tweet"]
y = df["sentiment"].map(label_to_id)

## Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): rows sharing a tweet_id look like near-duplicate rewordings of one
# tweet (see the `data.head()` preview), so a purely random split may place variants
# of the same tweet in both sets and inflate test scores — consider a group-aware
# split on tweet_id and confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Building ML Models

### Multinomial Naive Bayes

In [None]:
# TF-IDF features fed into a multinomial Naive Bayes classifier.
nb_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
clf_nb = Pipeline(steps=nb_steps)

# `Pipeline.fit` returns the fitted pipeline itself, so fit and predict can be chained.
y_pred = clf_nb.fit(X_train, y_train).predict(X_test)

nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.95      0.44      0.60      2696
           1       0.69      0.80      0.74      4119
           2       0.64      0.89      0.75      4380
           3       0.83      0.63      0.72      3605

    accuracy                           0.72     14800
   macro avg       0.78      0.69      0.70     14800
weighted avg       0.76      0.72      0.71     14800



### Random Forest Classifier

In [None]:
# TF-IDF features fed into a random forest.
# The forest is seeded so that the fitted model (which is pickled further down)
# and the report printed here are reproducible after a kernel restart.
clf_rf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=42))
])

clf_rf.fit(X_train, y_train)

y_pred = clf_rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90      2696
           1       0.85      0.95      0.90      4119
           2       0.91      0.94      0.92      4380
           3       0.94      0.89      0.91      3605

    accuracy                           0.91     14800
   macro avg       0.92      0.90      0.91     14800
weighted avg       0.91      0.91      0.91     14800



In [None]:
# Same random forest, but on raw unigram + bigram counts instead of TF-IDF weights.
# The forest is seeded so the comparison with the TF-IDF variant is reproducible.
clf_rfcv = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(random_state=42))
])

clf_rfcv.fit(X_train, y_train)

y_pred = clf_rfcv.predict(X_test)

print(classification_report(y_test, y_pred))

### Logistic Regression

In [None]:
# TF-IDF features fed into logistic regression.
# With the default iteration limit the solver stops before converging on this
# data (scikit-learn emits a ConvergenceWarning), so the limit is raised.
clf_lr = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000))
])

clf_lr.fit(X_train, y_train)

y_pred = clf_lr.predict(X_test)

print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.82      0.65      0.72      2696
           1       0.72      0.83      0.77      4119
           2       0.78      0.84      0.81      4380
           3       0.80      0.72      0.76      3605

    accuracy                           0.77     14800
   macro avg       0.78      0.76      0.77     14800
weighted avg       0.78      0.77      0.77     14800



### XGBoost Classifier

In [None]:
# TF-IDF features fed into a gradient-boosted tree classifier (XGBoost).
clf_xgb = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', XGBClassifier())
])
# `clf_gnb` is kept as an alias so any cell that refers to the pipeline by that
# name keeps working; the name is misleading because this is not a Naive Bayes model.
clf_gnb = clf_xgb

clf_xgb.fit(X_train, y_train)

y_pred = clf_xgb.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.33      0.46      2696
           1       0.64      0.69      0.66      4119
           2       0.58      0.84      0.68      4380
           3       0.70      0.55      0.62      3605

    accuracy                           0.63     14800
   macro avg       0.66      0.60      0.61     14800
weighted avg       0.65      0.63      0.62     14800



## Exporting Trained Model

In [None]:
# Persist the fitted TF-IDF + random forest pipeline so it can be reloaded for inference.
# The context manager guarantees the file is flushed and closed before it is read back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf_rf, model_file)

## Using GridSearchCV to Find Best Model

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Keys of every 'params' dict are constructor arguments of the matching 'model'.
models = {
    # NOTE(review): 3 values of C x 3 kernels gives 9 SVC fits per CV fold on the
    # full training set, which may take a very long time — confirm this is intended.
    'svm': {
        'model': SVC(),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['linear', 'rbf', 'poly']
        }
    },

    # The iteration limit is raised because the default stops before convergence
    # on these TF-IDF features (scikit-learn emits a ConvergenceWarning).
    'logistic_regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [1, 5, 10]
        }
    },

    # Seeded so the cross-validation scores are reproducible between runs.
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 50, 100]
        }
    },

    # Empty grid: only the default configuration is cross-validated.
    'naive_bayes': {
        'model': MultinomialNB(),
        'params': {}
    },

    'xgboost': {
        'model': XGBClassifier(),
        'params': {
            'n_estimators': [10, 50, 100]
        }
    }
}

In [None]:
# The TF-IDF features do not depend on the model being tuned, so they are
# computed once here instead of being refit on every loop iteration.
# NOTE(review): the vectorizer is fitted on all of X_train before cross-validation,
# so each validation fold contributes to the IDF weights — wrap the vectorizer and
# model in a Pipeline inside GridSearchCV if strictly clean CV scores are needed.
x_vector = TfidfVectorizer()
X_train_vectorized = x_vector.fit_transform(X_train)

scores = []

for model_name, model_params in models.items():
    # 5-fold cross-validated search over this model's parameter grid.
    clf = GridSearchCV(model_params['model'], model_params['params'], cv=5, return_train_score=False)
    clf.fit(X_train_vectorized, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# One row per model: best mean CV score and the parameters that achieved it.
scores_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
scores_df

## Validation Data

In [None]:
# The validation file is read with the same explicit column names as the training file.
VALIDATION_COLUMNS = ['tweet_id', 'entity', 'sentiment', 'tweet_text']
df_val = pd.read_csv('twitter_validation.csv', names=VALIDATION_COLUMNS)

In [None]:
# Remove rows with a missing value in any column before the text is processed.
# This mutates `df_val` in place.
df_val.dropna(inplace=True)

In [None]:
# (rows, columns) of the validation frame after rows with missing values were dropped.
df_val.shape

(1000, 4)

In [None]:
# Apply the same text processing that was used on the training tweets.
df_val["processed_tweet"] = df_val["tweet_text"].map(process_text)

In [None]:
# Validation features and integer-encoded labels, using the same label ids as for training.
val_label_to_id = {"Irrelevant": 0, "Positive": 1, "Negative": 2, "Neutral": 3}
X_val = df_val["processed_tweet"]
y_val = df_val["sentiment"].map(val_label_to_id)

In [None]:
# Reload the pipeline saved to 'model.pkl'. Unpickling can execute arbitrary code,
# so only load pickle files from a trusted source (here: the file written by this notebook).
# The context manager closes the file handle once the model is loaded.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Score the reloaded pipeline on the separate validation file.
y_pred = model.predict(X_val)
val_report = classification_report(y_val, y_pred)

print(val_report)

              precision    recall  f1-score   support

           0       0.98      0.90      0.94       172
           1       0.95      0.95      0.95       277
           2       0.94      0.96      0.95       266
           3       0.93      0.95      0.94       285

    accuracy                           0.94      1000
   macro avg       0.95      0.94      0.94      1000
weighted avg       0.95      0.94      0.94      1000

