In [1]:
import pandas as pd
import numpy as np
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn import utils
import re 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avyakta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Every corpus file is tab-separated without a header row; the two columns
# are named "sentiment" and "tweet" on load.
def _read_tweets(path):
    """Read one tab-separated file into a DataFrame with sentiment/tweet columns."""
    return pd.read_csv(path, sep='\t', names=["sentiment", "tweet"])

# training data
data = _read_tweets('data_twitter_sentiment/semeval_train.txt')
# data.head()

# test data: one frame per file, loaded in the same order as before
dt1, dt2, dt3, dt4 = (
    _read_tweets(path)
    for path in (
        'data_twitter_sentiment/Twitter2013_raw.txt',
        'data_twitter_sentiment/Twitter2014_raw.txt',
        'data_twitter_sentiment/Twitter2015_raw.txt',
        'data_twitter_sentiment/Twitter2016_raw.txt',
    )
)

# Only the first test file is used for evaluation; the alternative below
# would pool all four.
# dt = pd.concat([dt1, dt2, dt3, dt4])
dt = dt1

In [3]:
# Number of training tweets per sentiment label (shows the class imbalance).
sentiment_counts = data['sentiment'].value_counts()
sentiment_counts

neutral     4099
positive    3227
negative    1262
Name: sentiment, dtype: int64

In [5]:
def preprocess(features):
    """Clean raw tweets for bag-of-words vectorization.

    Each item is converted with ``str()``, padded with one space on either
    side, and passed through these steps in order:

    1. remove URL-like tokens (``htt`` followed by any run of non-space
       characters), ``@mentions`` and ``#hashtags``;
    2. remove standalone lowercase ``h`` / ``hm`` / ``hmm...`` tokens;
    3. blank out ASCII digits, underscores and all other non-word characters;
    4. remove every single ASCII letter that stands alone between whitespace;
    5. lowercase the text;
    6. remove words starting with ``aa`` (three or more characters) and the
       standalone tokens ``ba`` and ``th``;
    7. collapse each whitespace run into one space.

    Parameters
    ----------
    features : iterable
        Raw tweets. Any iterable is accepted (list, tuple, NumPy array,
        pandas Series, generator); non-string items are stringified.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. Because of the
        padding, every string begins and ends with a single space; an item
        with no surviving tokens becomes ``' '``.
    """
    # Substitutions applied to the original-case text, in this order.
    cased_steps = (
        # 'htt' + non-space run: covers http:// and https:// links, so a
        # separate https pattern is not needed.
        (re.compile(r'http*\S+'), ' '),
        (re.compile(r'@\S+'), ' '),
        (re.compile(r'#\S+'), ' '),
        # Replaced with '' (not ' '): the whitespace before the token stays.
        (re.compile(r'\bhm*\s+'), ''),
        (re.compile(r'[0-9_]'), ' '),
        (re.compile(r'\W'), ' '),
        # Lookarounds leave the neighbouring whitespace unconsumed, so runs
        # of single letters ("a b c") are all removed. Consuming it, as
        # r'\s+[a-zA-Z]\s+' does, skips every second letter. The padding
        # added below makes this cover letters at the very start/end too.
        (re.compile(r'(?<=\s)[a-zA-Z](?=\s)'), ' '),
    )
    # Substitutions applied after lowercasing.
    lowered_steps = (
        (re.compile(r'(?<=\s)aa\w+'), ' '),
        # Same lookaround reasoning: "ba ba" must lose both tokens.
        (re.compile(r'(?<=\s)(?:ba|th)(?=\s)'), ' '),
        # Collapse last so the spaces introduced above are squeezed as well.
        (re.compile(r'\s+'), ' '),
    )

    processed_features = []
    for raw in features:
        text = ' ' + str(raw) + ' '
        for pattern, replacement in cased_steps:
            text = pattern.sub(replacement, text)
        text = text.lower()
        for pattern, replacement in lowered_steps:
            text = pattern.sub(replacement, text)
        processed_features.append(text)
    return processed_features

In [6]:
# --- clean the raw tweet text (second column of each frame) ---
train_features = preprocess(data.iloc[:, 1].values)
test_features = preprocess(dt.iloc[:, 1].values)

# --- TF-IDF over 1- to 4-grams ---
# The vectorizer is fitted on the training tweets only and then applied to
# both splits, so the test tweets do not influence the fitted vectorizer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
    norm='l2',
    ngram_range=(1, 4),
)
vectorizer.fit(train_features)
# v = vectorizer

# NOTE(review): .toarray() materialises dense matrices even though the printed
# sparsity is ~99.9%; consider keeping the sparse output if memory or training
# time becomes a problem (confirm every downstream consumer accepts sparse input).
cv_array = vectorizer.transform(train_features).toarray()
cvt_array = vectorizer.transform(test_features).toarray()

print(cv_array.shape, sep='\n')
nonzero_fraction = np.count_nonzero(cv_array) / float(cv_array.size)
sparsity = 1.0 - nonzero_fraction
print('Sparsity of training features is ', sparsity*100, '%')

# Disabled experiment: chi-squared feature selection.
# ch2 = SelectKBest(chi2, k=500)
# cv_array = ch2.fit_transform(cv_array, data['sentiment'])
# cvt_array = ch2.transform(cvt_array)

# Disabled experiment: dimensionality reduction with truncated SVD.
# svd = TruncatedSVD(n_components=300, random_state=42)
# s = svd.fit(cv_array)
# cv_array = s.transform(cv_array) 
# cvt_array = s.transform(cvt_array)

# --- targets ---
# String labels are kept (label_*) next to their integer encodings (y_*);
# the encoder is fitted on the training labels only.
label_train = data['sentiment']
label_test = dt['sentiment']
lab_enc = preprocessing.LabelEncoder().fit(label_train)
y_train = lab_enc.transform(label_train)
y_test = lab_enc.transform(label_test)

# --- feature matrices under the names used by the model cells ---
x_train = cv_array
x_test = cvt_array

(8588, 17797)
Sparsity of training features is  99.92719082901486 %


In [10]:
from sklearn.feature_selection import chi2
import numpy as np

# Number of top-ranked terms to print per label and n-gram size.
N = 10
labels = ['positive', 'negative', 'neutral']

# The vocabulary does not depend on the label, so look it up once.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    all_feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    all_feature_names = np.array(vectorizer.get_feature_names())

for label in labels:
    # One-vs-rest chi-squared test: each term against "is this tweet <label>?".
    # The statistic measures strength of dependence, not its direction, so a
    # term can rank highly for a class it is negatively associated with.
    features_chi2 = chi2(cv_array, data['sentiment'] == label)
    # Ascending sort: the strongest terms end up at the tail of the array.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n {} Label Most correlated unigrams for:\n. {}".format(label, '\n. '.join(unigrams[-N:])))
    print(" {} Label Most correlated bigrams:\n. {}".format(label, '\n. '.join(bigrams[-N:])))



 positive Label Most correlated unigrams for:
. amazing
. thanks
. fun
. best
. wait
. excited
. great
. happy
. love
. good
 positive Label Most correlated bigrams:
. celebrity juice
. good night
. great day
. sm bacolod
. happy friday
. good day
. good morning
. looking forward
. good luck
. happy birthday

 negative Label Most correlated unigrams for:
. didn
. stupid
. sorry
. hate
. don
. worse
. sad
. shit
. bad
. fuck
 negative Label Most correlated bigrams:
. cancelled tomorrow
. watching game
. really don
. tv time
. feel bad
. breakout kings
. don like
. nets game
. don wanna
. don want

 neutral Label Most correlated unigrams for:
. hope
. thanks
. fun
. best
. excited
. wait
. great
. love
. happy
. good
 neutral Label Most correlated bigrams:
. sm bacolod
. white arrows
. good day
. daily zap
. happy friday
. george clooney
. good morning
. good luck
. looking forward
. happy birthday


Training models and evaluating them on the test set

In [10]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

# L2-penalised logistic regression on the TF-IDF features, trained against
# the integer-encoded labels.
lr = LogisticRegression(
    penalty='l2',
    C=1.8,
    solver='newton-cg',
    max_iter=500,
    random_state=1,
)
lr.fit(x_train, y_train)

y_train_pred = lr.predict(x_train)
y_pred = lr.predict(x_test)

# A large train/test gap in these numbers indicates overfitting.
train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
test_f1 = metrics.f1_score(y_test, y_pred, average='weighted')

print("Train accuracy: ", round(train_accuracy, 3))
print("Test accuracy: ", round(test_accuracy, 3))
print("F1: ", round(test_f1, 3))

Train accuracy:  0.889
Test accuracy:  0.644
F1:  0.624


In [11]:
from sklearn import svm

# Linear-kernel support vector classifier on the integer-encoded labels.
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')
linear.fit(x_train, y_train)
# Other kernels that were tried:
# rbf = svm.SVC(kernel='rbf', gamma=1, C=5, decision_function_shape='ovo').fit(x_train, y_train)
# sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(x_train, y_train)

linear_pred = linear.predict(x_test)

# `metrics` is the sklearn.metrics alias imported in the logistic-regression cell.
train_accuracy_linear = linear.score(x_train, y_train)
test_accuracy_linear = linear.score(x_test, y_test)
test_f1_linear = metrics.f1_score(y_test, linear_pred, average='weighted')

print('Train Accuracy Linear Kernel:', train_accuracy_linear)
print('Test Accuracy Linear Kernel:', test_accuracy_linear)
print("F1 Score Linear Kernel: ", round(test_f1_linear, 3))

Train Accuracy Linear Kernel: 0.8557289240801118
Test Accuracy Linear Kernel: 0.6419202518363064
F1 Score Linear Kernel:  0.621


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Random forest trained directly on the string labels (label_train).
text_classifier = RandomForestClassifier(
    n_estimators=50,
    max_depth=400,
    max_features=900,
    min_samples_leaf=2,
    random_state=0,
)
text_classifier.fit(x_train, label_train)

y_train_pred = text_classifier.predict(x_train)
predictions = text_classifier.predict(x_test)

# --- training-set fit ---
train_accuracy = accuracy_score(label_train, y_train_pred)
train_f1 = f1_score(label_train, y_train_pred, average='weighted')
print("Train accuracy: ", round(train_accuracy, 3))
print("Train F1: ", round(train_f1, 3))

# --- held-out performance ---
print(confusion_matrix(label_test, predictions))
# print(classification_report(label_test,predictions))
test_accuracy = accuracy_score(label_test, predictions)
test_f1 = f1_score(label_test, predictions, average='weighted')
print('Test accuracy: {}'.format(test_accuracy))
print("Test F1: ", round(test_f1, 3))

Train accuracy:  0.854
Train F1:  0.85
[[ 144  359   98]
 [  38 1398  203]
 [  48  583  941]]
Test accuracy: 0.6513641133263379
Test F1:  0.633


In [23]:
# Multinomial Naive Bayes on the TF-IDF features, trained on the string labels.
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# NOTE(review): class_prior is given as unnormalised weights rather than
# probabilities, and is presumably applied in sorted class order
# (negative, neutral, positive) - confirm this weighting is intended.
mnb = MultinomialNB(class_prior=[50, 40, 40])
mnb.fit(x_train, label_train)

y_train_pred = mnb.predict(x_train)
predictions = mnb.predict(x_test)

# accuracy_score / classification_report / f1_score come from the
# sklearn.metrics import in the random-forest cell.
train_accuracy = accuracy_score(label_train, y_train_pred)
print("Train accuracy: ", round(train_accuracy, 3))

# print(confusion_matrix(label_test,predictions))
print(classification_report(label_test, predictions))
test_accuracy = accuracy_score(label_test, predictions)
test_f1 = f1_score(label_test, predictions, average='weighted')
print('Test accuracy: {}'.format(test_accuracy))
print("Test F1: ", round(test_f1, 3))

Train accuracy:  0.841
              precision    recall  f1-score   support

    negative       0.53      0.29      0.37       601
     neutral       0.60      0.69      0.64      1639
    positive       0.65      0.65      0.65      1572

    accuracy                           0.61      3812
   macro avg       0.59      0.54      0.55      3812
weighted avg       0.61      0.61      0.60      3812

Test accuracy: 0.6114900314795383
Test F1:  0.602


In [8]:
# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Imported here so the cell does not depend on another cell's metric imports.
from sklearn.metrics import accuracy_score, classification_report, f1_score

knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(24, 30)}
# use grid search with 5-fold cross-validation to test all values for n_neighbors
knn_gscv = GridSearchCV(knn, param_grid, cv=5)
knn_gscv.fit(x_train, label_train)
predictions = knn_gscv.predict(x_test)
y_train_pred = knn_gscv.predict(x_train)

# The value is passed as a separate print argument, so the label must not
# contain a "{}" placeholder (it would be printed literally).
print("Train accuracy: ", round(accuracy_score(label_train, y_train_pred), 3))

# print(confusion_matrix(label_test,predictions))
print(classification_report(label_test, predictions))
# The "{}" placeholder is required: without it .format() silently drops the value.
print('Test accuracy: {}'.format(accuracy_score(label_test, predictions)))
print("Test F1: ", round(f1_score(label_test, predictions, average='weighted'), 3))

In [None]:
# Show the n_neighbors setting that the grid search selected as best.
best_knn_params = knn_gscv.best_params_
best_knn_params