In [1]:
# Import libraries

import pandas as pd, numpy as np, json, re

#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Create cleaning process
def clean(text):
    text=text.lower()
    text=re.sub('&lt;/?.*?&gt;',' &lt;&gt', text)
    text=re.sub('\\d|\\W+',' ',text)
    return text

# Create stop words list
stop_words = stopwords.words('english')

#### Create Classifiers

In [3]:
# Create Bernoulli NB classifier
bnb = BernoulliNB(class_prior=[0.25, 0.5])

# Create Multinomial NB classifier
mnb = MultinomialNB()

# Create Logistic Regression classifier (for Penalty l1 and l2)
cls1 = LogisticRegression(penalty = 'l1', solver = 'saga', multi_class = 'auto')
cls2 = LogisticRegression(penalty = 'l2', solver = 'saga', multi_class = 'auto')

### Categorized data

In [4]:
# Read the source data file for Categorized data
file = 'data/reddit/categorized-comments.jsonl'

data = []

with open(file) as f:
    for line in f:
        data.append(json.loads(line))
        
# Convert to Data Frame
category = pd.DataFrame(data)

In [5]:
# Check size of the total data
# Check structure
# Check categories
print('Size: ', len(category), '\n',
      'Shape: ', category.info(), '\n',
      'Categories: ', category.cat.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347476 entries, 0 to 2347475
Data columns (total 2 columns):
cat    object
txt    object
dtypes: object(2)
memory usage: 17.9+ MB
Size:  2347476 
 Shape:  None 
 Categories:  ['sports' 'science_and_technology' 'video_games' 'news']


In [6]:
# Since the size is humongus, I will take sample of all 4 categories. 
# By trial, sample of 1000 from each category can be easily handled by my machine
sample = category.groupby('cat').apply(lambda x :x.sample(1000))
del category
sample.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cat,txt
cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
news,1590595,news,Because sex and gender isn't the same thing?\n...
news,1575344,news,you been smoking some treated ganja dude
news,2231459,news,Funny how I've heard long winded discussions f...
news,2228315,news,Eh what does she actually do though?
news,2144152,news,"because it's not a one way street, sure it mig..."


In [7]:
# Clean the data
sample['txt'] = sample['txt'].apply(lambda x:clean(x))

In [8]:
# Create the feature matrix
cv = CountVectorizer(stop_words=stop_words)
txtvec = cv.fit_transform(sample['txt'])

In [9]:
# Create target and sample
target_cat = sample['cat']
features_cat = txtvec

# Create train test split
features_train, features_test, target_train, target_test = train_test_split(
    features_cat, target_cat, test_size=0.25, random_state=1)

In [10]:
# Create NB Multinomial model
model_catMNB = mnb.fit(features_cat, target_cat)

train_predMNB = model_catMNB.predict(features_train)
test_predMNB = model_catMNB.predict(features_test)

accuracy_train_catMNB = accuracy_score(target_train, train_predMNB)
accuracy_test_catMNB = accuracy_score(target_test, test_predMNB)

print('Cat Accuracy of Maultinomial training: ', accuracy_train_catMNB)
print('Cat Accuracy of Maultinomial test: ', accuracy_test_catMNB)

Cat Accuracy of Maultinomial training:  0.8236666666666667
Cat Accuracy of Maultinomial test:  0.845


In [11]:
# Create Logistic Regression model(s)
model_cat_LR1 = cls1.fit(features_train, target_train)
model_cat_LR2 = cls2.fit(features_train, target_train)

# Apply model to predict
test_pred1 = model_cat_LR1.predict(features_test)
train_pred1 = model_cat_LR1.predict(features_train)

test_pred2 = model_cat_LR2.predict(features_test)
train_pred2 = model_cat_LR2.predict(features_train)

# Measure accuracy
accuracy_test_catLR1 = accuracy_score(target_test, test_pred1)
accuracy_train_catLR1 = accuracy_score(target_train, train_pred1)

accuracy_test_catLR2 = accuracy_score(target_test, test_pred2)
accuracy_train_catLR2 = accuracy_score(target_train, train_pred2)

print('Cat Accuracy with penalty l1: ', accuracy_test_catLR1, accuracy_train_catLR1)
print('Cat Accuracy with penalty l2: ', accuracy_test_catLR2, accuracy_train_catLR2)



Cat Accuracy with penalty l1:  0.516 0.6706666666666666
Cat Accuracy with penalty l2:  0.549 0.8206666666666667


In [12]:
# Clear memory by unloading unnecessery data set

del sample

### Controversy Data

In [13]:
# Read the source data file for Categorized data
file = 'data/reddit/controversial-comments.jsonl'

data = []

with open(file) as f:
    for line in f:
        data.append(json.loads(line))
        
# Convert to Data Frame
controversy = pd.DataFrame(data)

In [14]:
# Check size of the total data
# Check structure
# Check categories
print('Size: ', len(controversy), '\n',
      'Shape: ', controversy.info(), '\n',
      'Categories: ', controversy.con.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950000 entries, 0 to 949999
Data columns (total 2 columns):
con    950000 non-null int64
txt    950000 non-null object
dtypes: int64(1), object(1)
memory usage: 10.9+ MB
Size:  950000 
 Shape:  None 
 Categories:  [0 1]


In [15]:
# Since the size is humongus, I will take sample of the 2 categories. 
# By trial, sample of 50000 from each category can be easily handled by my machine
size = 50000    # sample size
replace = True  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

cont = controversy.groupby('con', as_index=False).apply(fn)

del controversy

cont.head()

Unnamed: 0,Unnamed: 1,con,txt
0,867329,0,"I know. Any way you slice it, he was lying."
0,342181,0,good
0,928492,0,"My point still stands, All these polls are utt..."
0,586530,0,"Hey, there's a political system that's yet to ..."
0,591565,0,Look again where the linked article is from...


In [16]:
# Clean the data
cont['txt'] = cont['txt'].apply(lambda x:clean(x))

In [17]:
# Create the feature matrix
cv = CountVectorizer(stop_words=stop_words)
txtvec = cv.fit_transform(cont['txt'])

In [18]:
# Create target and sample
target_con = cont['con']
features_con = txtvec

# Create train test split
features_train, features_test, target_train, target_test = train_test_split(
    features_con, target_con, test_size=0.25, random_state=1)

In [19]:
# Train model for Bernoulli
modelBNB = bnb.fit(features_con, target_con)

train_predBNB = modelBNB.predict(features_train)
test_predBNB = modelBNB.predict(features_test)

accuracy_train_conBNB = accuracy_score(target_train, train_predBNB)
accuracy_test_conBNB = accuracy_score(target_test, test_predBNB)

print('Con Accuracy of Bernoulli training: ', accuracy_train_conBNB)
print('Con Accuracy of Bernoulli test: ', accuracy_test_conBNB)

Con Accuracy of Bernoulli training:  0.6247733333333333
Con Accuracy of Bernoulli test:  0.62388


In [20]:
# Train model for Logistic Regression
model_con_LR1 = cls1.fit(features_train, target_train)
model_con_LR2 = cls2.fit(features_train, target_train)

# Apply model to predict
test_pred1 = model_con_LR1.predict(features_test)
train_pred1 = model_con_LR1.predict(features_train)

test_pred2 = model_con_LR2.predict(features_test)
train_pred2 = model_con_LR2.predict(features_train)

# Measure accuracy
accuracy_test_conLR1 = accuracy_score(target_test, test_pred1)
accuracy_train_conLR1 = accuracy_score(target_train, train_pred1)

accuracy_test_conLR2 = accuracy_score(target_test, test_pred2)
accuracy_train_conLR2 = accuracy_score(target_train, train_pred2)

print('Con Accuracy with penalty l1: ', accuracy_test_conLR1, accuracy_train_conLR1)
print('Con Accuracy with penalty l2: ', accuracy_test_conLR2, accuracy_train_conLR2)



Con Accuracy with penalty l1:  0.61688 0.65192
Con Accuracy with penalty l2:  0.62112 0.6608933333333333


In [21]:
# Create final output data frame for accuracy
accuracy = {'Model':['Logistic Regression(L1)',
                     'Logistic Regression(L2)',
                     'Naive Bayes',
                     'Logistic Regression(L1)',
                     'Logistic Regression(L2)',
                     'Naive Bayes',
                    ], 
            'Data Set':['Controversy',
                        'Controversy',
                        'Controversy',
                        'Category',
                        'Category',
                        'Category',
                       ],
            'Accuracy_Train':[accuracy_train_conLR1,
                              accuracy_train_conLR2,
                              accuracy_train_conBNB,
                              accuracy_train_catLR1,
                              accuracy_train_catLR2,
                              accuracy_train_catMNB
                             ],
            'Accuracy_Test':[accuracy_test_conLR1,
                              accuracy_test_conLR2,
                              accuracy_test_conBNB,
                              accuracy_test_catLR1,
                              accuracy_test_catLR2,
                              accuracy_test_catMNB
                            ]
           } 
  
# Create DataFrame 
df_accuracy = pd.DataFrame(accuracy)
  
# Print the output
df_accuracy

Unnamed: 0,Model,Data Set,Accuracy_Train,Accuracy_Test
0,Logistic Regression(L1),Controversy,0.65192,0.61688
1,Logistic Regression(L2),Controversy,0.660893,0.62112
2,Naive Bayes,Controversy,0.624773,0.62388
3,Logistic Regression(L1),Category,0.670667,0.516
4,Logistic Regression(L2),Category,0.820667,0.549
5,Naive Bayes,Category,0.823667,0.845


In [22]:
# Clear memory
del cont

**End of code**