## Assignment Sentiment Analysis

## Dataset - Kaggle - Twitter US Airline Sentiment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv("Data/Tweets.csv")

In [2]:
features = dataset.iloc[:, 10].values
labels = dataset.iloc[:, 1].values
import re
process_features = []

for tweet in range(0, len(features)):
    # Filtering out the special characters
    clean_tweet = re.sub(r'\W', ' ', str(features[tweet]))

    # Filtering out all single characters
    clean_tweet= re.sub(r'\s+[a-zA-Z]\s+', ' ', clean_tweet)

    # Filtering out single characters from the start
    clean_tweet = re.sub(r'\^[a-zA-Z]\s+', ' ', clean_tweet) 

    # Substituting multiple spaces with single space
    clean_tweet = re.sub(r'\s+', ' ', clean_tweet, flags=re.I)

    # Removing prefixed 'b'
    clean_tweet = re.sub(r'^b\s+', '', clean_tweet)

    # Converting to Lowercase
    clean_tweet = clean_tweet.lower()

    process_features.append(clean_tweet)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(process_features, labels, test_size=0.3, random_state=10)

from sklearn.feature_extraction.text import TfidfVectorizer
spam_fil = TfidfVectorizer(max_df=0.8,min_df=2,stop_words='english')
#spam_fil.fit(X_train) 
#print(spam_fil.get_feature_names())

X_train = spam_fil.fit_transform(X_train).toarray()
X_test = spam_fil.transform(X_test).toarray()
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
#Using Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

pred_test_nb = nb_model.predict(X_test)
print(accuracy_score(y_test, pred_test_nb))

0.4924863387978142


In [5]:
#using K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
kneigh = KNeighborsClassifier(n_neighbors = 5)
kneigh.fit(X_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix
pred_test = kneigh.predict(X_test)
print(accuracy_score(y_test, pred_test))

0.694216757741348


In [6]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)
predictions = text_classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[2560  104   67]
 [ 540  337   81]
 [ 220   89  394]]
              precision    recall  f1-score   support

    negative       0.77      0.94      0.85      2731
     neutral       0.64      0.35      0.45       958
    positive       0.73      0.56      0.63       703

    accuracy                           0.75      4392
   macro avg       0.71      0.62      0.64      4392
weighted avg       0.73      0.75      0.73      4392

0.7493169398907104
