# Predict The News Category : Anshul Patel

#### Import General Libraries

In [None]:
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from imblearn.over_sampling import SMOTE

#### Import Training Data

In [None]:
# Load the labelled training sheet; the cells below read its 'STORY' and
# 'SECTION' columns.
docs = pd.read_excel(
    'Data_Train.xlsx',
    sheet_name='Sheet1',
)
# Preview the first rows (displayed as the cell output in a notebook).
docs.head()

In [None]:
# Target labels as a NumPy array (one entry per row of `docs`).
y = docs['SECTION'].to_numpy()

#### Preprocessing Text Data

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Build the English stop-word set and the stemmer once, then reuse them.

    `stopwords.words('english')` returns a list; converting it to a frozenset
    makes the per-token membership test a hash lookup instead of a linear scan.
    """
    return frozenset(stopwords.words('english')), PorterStemmer()


def preprocess(message):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps: cast to ``str``, lower-case, replace every non-letter character
    with a space, tokenize, drop English stop words, Porter-stem each
    remaining token, and join the stems with single spaces.

    Args:
        message: The raw text. Any object is accepted because it is passed
            through ``str()`` first (so e.g. NaN becomes the text "nan").

    Returns:
        str: The cleaned, stemmed text; an empty string if no tokens survive.
    """
    text = str(message).lower()
    # Keep letters only; digits and punctuation become token separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    stop_words, stemmer = _preprocess_resources()
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(stems)

In [None]:
# X_og keeps the raw stories; X holds the same stories after preprocess(),
# in the same order.
X_og = list(docs['STORY'])
X = [preprocess(story) for story in X_og]

#### Train-Test Split

In [None]:
# Split the raw stories, the preprocessed stories and the labels in ONE call
# so the three stay aligned row-for-row by construction. The previous version
# split twice with identical settings and let the second call overwrite
# y_train / y_test; a single multi-array call yields the same partition.
X_train_og, X_test_og, X_train, X_test, y_train, y_test = train_test_split(
    X_og, X, y, random_state=42, test_size=0.33, stratify=y
)

# Report how many training samples each class has (before any resampling).
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

#### Generate TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary / IDF weights on the training split only, then reuse the
# fitted vectorizer for every other split.
tf = TfidfVectorizer(max_features=3000)
X_train_tf = tf.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(tf, 'get_feature_names_out'):
    unique = tf.get_feature_names_out().tolist()
else:
    unique = tf.get_feature_names()

# Build the dense frames straight from the array form instead of going
# through an intermediate Python list of lists.
train_df = pd.DataFrame(X_train_tf.toarray(), columns=unique)

X_test_tf = tf.transform(X_test)
test_df = pd.DataFrame(X_test_tf.toarray(), columns=unique)

#### Apply SMOTE to the training data

In [None]:
# Oversample the minority classes of the TRAINING split only.
# - SMOTE's constructor arguments are keyword-only in current
#   imbalanced-learn, so the strategy must be passed by name.
# - fit_sample() was removed; fit_resample() is the supported method.
# - random_state is fixed so the synthetic samples are reproducible, in line
#   with the seeded split and models elsewhere in this notebook.
sm = SMOTE(sampling_strategy='auto', random_state=42)
x_res, y_res = sm.fit_resample(train_df, y_train)

# Report the class counts after resampling.
unique_elements, counts_elements = np.unique(y_res, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

#### Import Models

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

#### Fit the Stacking Classifier

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners of the stack, as (name, estimator) pairs.
estimators = [
    ('SVC', SVC(kernel='rbf', random_state=0, probability=True)),
    ('XG', XGBClassifier()),
    ('RF', RandomForestClassifier(random_state=0)),
    ('LR', LogisticRegression()),
    (
        'GBC',
        GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=1.0,
            max_depth=5,
            random_state=0,
        ),
    ),
]

# A logistic regression combines the base learners' outputs.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)
# Train on the SMOTE-resampled training features and labels.
clf.fit(x_res, y_res)

# Evaluate on the held-out split (which was not resampled).
ypred1 = clf.predict(test_df)
ypred_prob1 = clf.predict_proba(test_df)
print(classification_report(y_test, ypred1))

#### Save Model

In [None]:
# Persist the fitted stacking model, then the fitted TF-IDF vectorizer; both
# are needed to score new text later.
for filename, fitted_obj in (
    ('PTNC_sclf.sav', clf),
    ('PTNC_Stacking_TFIDF.sav', tf),
):
    joblib.dump(fitted_obj, filename)

#### Produce results on Test Data

In [None]:
# Load the unlabelled competition sheet and clean it exactly like the
# training stories.
test_docs = pd.read_excel('Data_Test.xlsx', sheet_name='Sheet1')
X_T_og = list(test_docs['STORY'])
X_T = [preprocess(story) for story in X_T_og]

# Vectorize with the already-fitted TF-IDF model (transform only, no refit).
# NOTE: this rebinds X_test_tf, which earlier held the validation split.
X_test_tf = tf.transform(X_T)
dense = X_test_tf.todense()
denselist = dense.tolist()
test_df_T = pd.DataFrame(denselist, columns=unique)

# Predict a section for each story and write the predictions to Excel.
ypred_T = clf.predict(test_df_T)
df_T = pd.DataFrame(ypred_T)
df_T.to_excel(excel_writer="SmoteResult.xlsx")

## RESULT ON PUBLIC DATA : 0.97452