In [17]:
import json
import codecs
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix



In [6]:
# 1.1. In HackerRank
# data = sys.stdin.readlines()
# data = [line.rstrip() for line in data]

# 1.2. Here in VScode
import codecs
file_name = 'quora_topics.json'
with codecs.open(file_name, 'r', encoding='iso-8859-1') as f:
    data = f.read()
# print(data)
data = data.split("\n")
# print(data) # Lists with every element being a string, as always

# at this point, ouput in Hackerrank and in Visual Studio es the same GREAAT

# 2. This part is to store our data in nice lists
N = int(data[0].strip())
topics = []
questions = []
excerpts = []
rows = []
for i in range(1,N+1):
    item = json.loads(data[i])
    
    topics.append(item['topic'])
    questions.append(item['question'])
    excerpts.append(item['excerpt'])

    row = [item['topic'], item['question'], item['excerpt']]
    rows.append(row)
colnames = ["topic", "question", "excerpt"]
training_df = pd.DataFrame(rows, 
                           columns= colnames)

#####
# We need to make the same but for testing data
N = int(data[0].strip())
topics = []
questions = []
excerpts = []
rows = []
for i in range(1,N+1):
    item = json.loads(data[i])
    
    topics.append(item['topic'])
    questions.append(item['question'])
    excerpts.append(item['excerpt'])

    row = [item['topic'], item['question'], item['excerpt']]
    rows.append(row)

colnames = ["topic", "question", "excerpt"]
testing_df = pd.DataFrame(rows, 
                           columns= colnames)

In [9]:
# Store data in nice panda data frames
colnames = ["topic", "question", "excerpt"]
# training_df = pd.DataFrame([questions, excerpts, topics], 
#                            columns= colnames)
training_df = pd.DataFrame(rows, 
                           columns= colnames)

In [14]:
def dataset_inspection(df):
        # Missingness
        missing_values = df.isnull().sum()
        print('Missing values: ', missing_values)

        # Duplicated rows
        print('Number of duplicated rows: ', df.duplicated().sum())
        print('Percentage of duplicated rows: ', (df.duplicated().sum() / df.apply(len)))

        # Count and percentage per label
        label_counts = df['topic'].value_counts()
        label_percentages = df['topic'].value_counts(normalize=True) * 100
        print('Label counts:')
        print(label_counts)
        print('Label percentages:')
        print(label_percentages)

        # Evaluate distribution of predictors
        predictors = "question"
        df['document_length'] = df[predictors].apply(len)
        print('Minimum document length:', df['document_length'].min())
        print('Maximum document length:', df['document_length'].max())
        print('Average document length:', df['document_length'].mean())
        print('Median document length:', df['document_length'].median())
        print('q1 document length:', df['document_length'].quantile(0.25))
        print('q3 document length:', df['document_length'].quantile(0.75))

In [15]:
dataset_inspection(training_df)
# Data seems balanced

Missing values:  topic              0
question           0
excerpt            0
question_length    0
dtype: int64
Number of duplicated rows:  5
Percentage of duplicated rows:  topic              0.000247
question           0.000247
excerpt            0.000247
question_length    0.000247
dtype: float64
Label counts:
gis            2383
scifi          2333
android        2239
electronics    2079
apple          2064
unix           1965
photo          1945
wordpress      1943
security       1899
mathematica    1369
Name: topic, dtype: int64
Label percentages:
gis            11.785944
scifi          11.538652
android        11.073743
electronics    10.282408
apple          10.208220
unix            9.718582
photo           9.619665
wordpress       9.609773
security        9.392156
mathematica     6.770859
Name: topic, dtype: float64
Minimum document length: 15
Maximum document length: 157
Average document length: 54.134032345813345
Median document length: 51.0
q1 document length: 39.0
q3 do

In [39]:
# 2 Build-up of the text classifier ##### 

# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

# classifier = MultinomialNB()
# classifier = LinearSVC(class_weight='balanced')
classifier = RandomForestClassifier(random_state=123)


# Split in test and training data 
X = vectorizer.fit_transform(training_df['question'] + ' ' + training_df['excerpt'])
y = training_df['topic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# # Train the model in training data
classifier.fit(X_train, y_train)

# # Test in testing data AND obtaining classification_report
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred, digits=4)
print(report)

cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

     android     0.8799    0.8966    0.8882       899
       apple     0.8634    0.7815    0.8204       801
 electronics     0.8530    0.8020    0.8267       803
         gis     0.8944    0.8781    0.8862       993
 mathematica     0.7748    0.7818    0.7783       550
       photo     0.9004    0.9206    0.9104       756
       scifi     0.7855    0.9310    0.8521       956
    security     0.8552    0.6588    0.7443       762
        unix     0.7594    0.7668    0.7631       819
   wordpress     0.8385    0.9426    0.8875       749

    accuracy                         0.8408      8088
   macro avg     0.8404    0.8360    0.8357      8088
weighted avg     0.8426    0.8408    0.8392      8088

[[806  23  14   2   9   6  15   7  10   7]
 [ 30 626   3   3   7  14  21  23  53  21]
 [ 18   6 644  14  29   8  53   5  14  12]
 [  7   5  14 872  17   5  22   6  30  15]
 [  0   4   9  33 430   7  36   2  17  12]
 [  4   8  10   8   2 696 

In [36]:
X_eval = vectorizer.transform(testing_df['question'] + ' ' + testing_df['excerpt'])
y_eval = classifier.predict(X_eval)

for y in y_eval:
    print(y)

<2x2 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>