In [1]:
# Import the libraries needed for text cleaning, feature extraction, modelling and evaluation
import re
import pandas as pd
from textblob import Word
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix,cohen_kappa_score,accuracy_score

In [2]:
# Preprocessing the data -- removing the contractions, punctuations
def clean_str(string):
    """Normalise a raw text string before tokenisation.

    An ordered list of regex substitutions is applied; the order matters
    because later patterns operate on the output of earlier ones:

    1. English contraction suffixes ('s, 've, n't, 're, 'd, 'll) are deleted.
       Note that "n't" is removed as a whole, so "don't" becomes "do".
    2. Commas, parentheses, question marks and stray apostrophes are deleted
       (without inserting a space), while "!" is padded with spaces so it
       survives as a separate token.
    3. Every remaining character outside letters, digits and a small
       punctuation whitelist becomes a space.
    4. Any digit, together with the word characters that follow it, is deleted.
    5. Runs of two or more whitespace characters collapse to one space.

    Args:
        string: the text to clean (must be a ``str``).

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    substitutions = (
        (r"\'s", ""),
        (r"\'ve", ""),
        (r"n\'t", ""),
        (r"\'re", ""),
        (r"\'d", ""),
        (r"\'ll", ""),
        (r",", ""),
        (r"!", " ! "),
        (r"\(", ""),
        (r"\)", ""),
        (r"\?", ""),
        (r"'", ""),
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"[0-9]\w+|[0-9]", ""),
        (r"\s{2,}", " "),
    )

    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

In [3]:
# Load the labelled news articles from the CSV file, then normalise every
# article: clean_str() strips contractions/punctuation/digits, and each
# remaining whitespace-separated token is passed through textblob's
# Word.lemmatize() before the tokens are re-joined with single spaces.
data = pd.read_csv(
    'C:/Users/Vijay/Desktop/code/code/dataset.csv',
    encoding="ISO-8859-1",
)

# x holds the preprocessed article texts, y the matching class labels.
x = [
    ' '.join(Word(token).lemmatize() for token in clean_str(article).split())
    for article in data['news'].tolist()
]
y = data['type'].tolist()

In [4]:
# Convert the cleaned articles into a sparse TF-IDF matrix (X) and the labels
# into a NumPy array (Y). English stop words are dropped and min_df=2 is
# passed so that very rare terms are excluded from the vocabulary.
# NOTE(review): the vectorizer is fitted on the full corpus here, i.e. before
# the train/test split below, so the IDF statistics also see the test
# articles -- confirm whether this is acceptable for the evaluation.
vect = TfidfVectorizer(min_df=2, stop_words='english')
X, Y = vect.fit_transform(x), np.array(y)

In [5]:
# The second dimension of the TF-IDF matrix is the number of features (columns).
n_features = X.shape[1]
print("no of features extracted:", n_features)

no of features extracted: 14788


In [6]:
# Hold out 20% of the samples as a test set using train_test_split();
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# Report the shape of each split.
for label, matrix in (("train size:", X_train), ("test size:", X_test)):
    print(label, matrix.shape)

train size: (1780, 14788)
test size: (445, 14788)


In [7]:
# Train a random forest on the training split, then predict the labels of the
# held-out test split.
# random_state is pinned (42, the same seed passed to train_test_split) so the
# forest -- and therefore the saved model and the metrics reported below -- is
# identical on every Restart & Run All. Without it each run draws different
# bootstrap samples and feature subsets.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=150,
    n_jobs=1,
    random_state=42,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [8]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer to disk
# (relative to the current working directory) for later reuse.
model_path, vectorizer_path = 'model.pkl', 'vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(vect, vectorizer_path)

['vectorizer.pkl']

In [9]:
# Evaluate the test-set predictions with three metrics: overall accuracy,
# Cohen's kappa, and the confusion matrix.
acc = accuracy_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)
c_mat = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:\n", c_mat)
print("\nKappa: ", kappa)
print("\nAccuracy: ", acc)

Confusion Matrix:
 [[110   0   3   1   1]
 [  2  67   2   0   1]
 [  3   0  73   0   0]
 [  0   0   0 102   0]
 [  2   1   0   2  75]]

Kappa:  0.9489405645222979

Accuracy:  0.9595505617977528
