In [1]:
#Utility Libraries
import pickle
import numpy as np
import pandas as pd

#Plotting Libraries
import seaborn as sns
import matplotlib.pyplot as plt

#Sklearn Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
#Read the pre-processed training CSV into a DataFrame
encodingData = "ISO-8859-1"  # encoding handed to the CSV reader
data = pd.read_csv(
    'C:/Datasets/TwitterData/processedData.csv',
    encoding=encodingData,
    low_memory=False,  # parse the whole file at once instead of in chunks
)

In [None]:
#Plot the number of rows per sentiment class to check how balanced the dataset is (disabled; uncomment to run)
#ax = data.groupby('sentiment').count().plot(kind='bar', title='Distribution of data', legend=False)
#ax.set_xticklabels(['Negative','Positive'], rotation=0)

In [None]:
#Hold out 5% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.05,
    random_state=0,
)
print('Data Split Complete')

In [None]:
#Fit a TF-IDF vectorizer (unigrams + bigrams, vocabulary capped at 500,000 terms) on the training text
vector = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
# astype("U") coerces every entry to a string before fitting, so non-string
# values in the text column do not break tokenization.
vector.fit(X_train.astype("U").str.lower())

print('Fitted')
# vocabulary_ maps each learned term to its column index, so its length is the
# feature count. get_feature_names() was removed in scikit-learn 1.2 and would
# raise AttributeError on current versions.
print('Number of feature words: ', len(vector.vocabulary_))

In [None]:
#Data Transformation
# Apply the same string coercion that was used when fitting the vectorizer;
# without it, any non-string entry (e.g. NaN) makes transform() raise.
# Explicit lower-casing is not needed here: the vectorizer lower-cases by default.
# NOTE: this cell rebinds X_train / X_test from text to TF-IDF matrices, so
# re-run the split cell before running this cell a second time.
X_train = vector.transform(X_train.astype("U"))
X_test = vector.transform(X_test.astype("U"))
print('Data Transformed')

In [None]:
def model_Evaluate(model, X=None, y=None):
    """Print a classification report and plot a labelled confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : features to predict on. Defaults to the notebook-level ``X_test``.
    y : true labels for ``X``. Defaults to the notebook-level ``y_test``.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2; the cell annotations below are
        written for a binary Negative/Positive problem only.

    The heatmap is drawn on a new figure so repeated calls do not overlay
    each other. Nothing is returned.
    """
    # Fall back to the notebook's held-out split so existing
    # `model_Evaluate(model)` calls keep working unchanged.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    # Predict values for the evaluation dataset
    y_pred = model.predict(X)

    # Print the evaluation metrics for the dataset.
    print(classification_report(y, y_pred))

    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y, y_pred)
    if cf_matrix.shape != (2, 2):
        raise ValueError(
            f'model_Evaluate expects a binary problem (2x2 confusion matrix), '
            f'got shape {cf_matrix.shape}'
        )

    categories = ['Negative','Positive']
    # Order matches the row-major flattening of the matrix:
    # rows are actual classes, columns are predicted classes.
    groupNames = ['True Neg','False Pos', 'False Neg','True Pos']
    groupPercentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]

    labels = [f'{v1}\n{v2}' for v1, v2 in zip(groupNames,groupPercentages)]
    labels = np.asarray(labels).reshape(2,2)

    # Draw on a dedicated axes instead of whatever pyplot considers current.
    fig, ax = plt.subplots()
    sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
                xticklabels = categories, yticklabels = categories, ax = ax)

    ax.set_xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    ax.set_ylabel("Actual values"   , fontdict = {'size':14}, labelpad = 10)
    ax.set_title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

In [None]:
#Train a logistic-regression classifier on the TF-IDF features, then evaluate it on the test split
LRmodel = LogisticRegression(
    C=2,
    max_iter=1000,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the estimator itself
model_Evaluate(LRmodel)