In [87]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [74]:
# Downloaded the file from Week 2 Data Files andsaved in my local at C:\BU\DSC550\wk8-9\data\categorized-comments.jsonl
# # Pre-processing Text
def readFile(f='C:/BU/DSC550/wk8-9/data/categorized-comments.jsonl'):
    df = pd.read_json(f, lines=True)
    return df
    
df_categorized_comments = readFile()
df_categorized_comments.count()

cat    606476
txt    606476
dtype: int64

In [89]:
# Reducing the size of data to execute the task and control the processing time and memory errors
# MemoryError: Unable to allocate 52.3 GiB for an array with shape (151619, 46256) and data type int64
df_sample_categorized_comments = df_categorized_comments.sample(frac = 0.080) # analyzing with 48518 rows of data from cleaned dataframe
print(df_sample_categorized_comments.count())
df_sample_categorized_comments.head()

cat    48518
txt    48518
dtype: int64


Unnamed: 0,cat,txt
48053,sports,[deleted]
47621,sports,Let's do this but sub in wrestling or women's ...
344919,video_games,Here you go http://www.muthead.com/17/team-bui...
52326,sports,Ryzen shine baby
487970,video_games,[deleted]


In [90]:
#Convert text to lowercase.
# Remove puctuations
# Clean the text using stopwords and stemmer
stemmer = SnowballStemmer('english')
words = stopwords.words('english')
df_sample_categorized_comments['txt_cleaned'] = df_sample_categorized_comments['txt'].apply(lambda x: ' '.join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
df_sample_categorized_comments.head()

Unnamed: 0,cat,txt,txt_cleaned
48053,sports,[deleted],delet
47621,sports,Let's do this but sub in wrestling or women's ...,let sub wrestl women soccer
344919,video_games,Here you go http://www.muthead.com/17/team-bui...,here go http www muthead com team builder
52326,sports,Ryzen shine baby,ryzen shine babi
487970,video_games,[deleted],delet


In [91]:
# Convert Clean_text into a term frequency-inverse document frequency (tfidf) vector.
count = CountVectorizer()
X = count.fit_transform(df_sample_categorized_comments['txt_cleaned']).toarray()

In [92]:
# converting categorical variables to numerical
df_sample_categorized_comments = df_sample_categorized_comments.replace({'cat': {'video_games': 0, 'sports': 1, 'science_and_technology':2}})
df_sample_categorized_comments['cat']= df_sample_categorized_comments['cat'].astype(int)

In [93]:
#Step to split the dataset to train and test sets 

X_train, X_test, y_train, y_test = train_test_split(X, df_sample_categorized_comments['txt_cleaned'], random_state = 0)

In [94]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X)
X_train_tfidf.shape

(48518, 25765)

# 1. Neural Network Classifier with Sciki

In [95]:
classifier = MLPClassifier(hidden_layer_sizes =(800, 260), 
                               batch_size = 5000, 
                               max_iter = 8, 
                               verbose = True, 
                               random_state = 0)
classifier.fit(X_train, y_train)

Iteration 1, loss = 10.41933917
Iteration 2, loss = 10.39750489
Iteration 3, loss = 10.31309856
Iteration 4, loss = 10.11083590
Iteration 5, loss = 9.84356786
Iteration 6, loss = 9.45246897
Iteration 7, loss = 9.05307735
Iteration 8, loss = 8.72091249




MLPClassifier(batch_size=5000, hidden_layer_sizes=(800, 260), max_iter=8,
              random_state=0, verbose=True)

In [96]:

#Running the model function
y_pred =classifier.predict(X_test)

In [102]:
#Displaying the result metrics by comparing the predicted categories to the true categories
#The training accuracies were checked as well, and it was confirmed that overfitting was not occurring


from sklearn.metrics import f1_score, precision_score, recall_score
f1 = f1_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
cf_matrix=confusion_matrix(y_test, y_pred)

In [104]:
print("Accuracy score: {}".format(accuracy))
print("Precision score: {}".format(precision))
print("Recall score: {}".format(recall))
print("F1 score: {}".format(f1))
print("confusion matrix: {}".format(cf_matrix))

Accuracy score: 0.03619126133553174
Precision score: 0.00033131332187408204
Recall score: 0.000877654906090925
F1 score: 0.00040515461974680706
confusion matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
