# **Question 1b:** Classification Task


---


**Imports**

Import needed libraries and download packages "punkt" and "popular"

In [None]:
import pandas as pd
from string import digits
import re
import nltk

nltk.download('punkt')
nltk.download('popular')

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
import pickle
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    | 

Code to mount google drive.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Read train and test data in dataframes from csv files.

In [None]:
df = pd.read_csv("/content/drive/MyDrive/University/datasets2020/datasets/q1/train.csv")
df.head()

Unnamed: 0,Id,Title,Content,Label
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business


In [None]:
df_test = pd.read_csv("/content/drive/MyDrive/University/datasets2020/datasets/q1/test_without_labels.csv")
df_test.head()

Unnamed: 0,Id,Title,Content
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...


# **Text Data Cleaning**


---


>For the data cleaning process, we included the following:
* Remove all whitespaces and newlines 
* Remove all special characters and punctuation
* Replace contractions with full words (despite that they might be removed   later from the stopwords)
* Splitted attached words e.g.  “ForTheWin” =>  “For The Win”
* Lowered all capital letters  
>We tried removing the numbers from the text corpus and standarizing the sentences but it was proved to have no effect in our model's performance.

>As a rule, the Title of an article is a strong indicator of the content that follows. We need the Title to have high importance/weight in our data. Thus we concatenate the Title *three* times along with the Content, in a new dataframe column, named *Title Content*. This is the column we process and use to train our classifiers later.

>Both train and test data are preprocessed in the same exact way.

>(reference link: [python-efficient-text-data-cleaning](https://www.geeksforgeeks.org/python-efficient-text-data-cleaning/)




In [None]:
def numbers(x):
  return re.sub(r'[0-9]+', '', x)

def blank_space(x):
  return re.sub('[^A-Za-z0-9]+', ' ', x)

def standarize_sentence(x):
  return ''.join(''.join(word)[:2] for word in x) 

def apostrophe_words(x):
  Apos_dict={"'s":" is","'t":" not","'m":" am","'ll":" will", "'d":" would","'ve":" have","'re":" are", 
             "’s":" is","’t":" not","’m":" am", "’d":" would","’ve":" have","’re":" are", "’ll": "will"} 
  for key,value in Apos_dict.items(): 
    if key in x:
      x =  x.replace(key,value)
      return x
  return x

def split_words(x):
  return " ".join([word for word in re.split("([A-Z][a-z]+[^A-Z]*)",x) if word])

def shallow_cleaning(df):
  remove_digits = str.maketrans('', '', digits)
  df['Title Content'] = df['Title Content'].apply(lambda x: blank_space(x))
  #df['Title Content'] = df['Title Content'].apply(lambda x: numbers(x))
  #df['Title Content'] = df['Title Content'].apply(lambda x: split_words(x))
  #df['Title Content'] = df['Title Content'].apply(lambda x: standarize_sentence(x))
  df['Title Content'] = df['Title Content'].apply(lambda x: apostrophe_words(x))
  df['Title Content'] = df['Title Content'].str.strip()
  df['Title Content'] = df['Title Content'].str.lower()
  return df

df['Title Content'] = pd.DataFrame(df['Title'] + ' ' + df['Title'] + ' ' + df['Title'] + ' ' + df['Content'])
df = shallow_cleaning(df)
display(df)

Unnamed: 0,Id,Title,Content,Label,Title Content
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment,netflix is coming to cable boxes and amazon is...
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment,pharrell iranian president react to tehran hap...
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology,wildlife service seeks comments wildlife servi...
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology,facebook teams up with storyful to launch fb n...
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business,caesars plans us 880 mln new york casino caesa...
...,...,...,...,...,...
111790,31462,Microsoft requires Office 2013 licensing for s...,in contrast to the muckle of special licenses...,Technology,microsoft requires office 2013 licensing for s...
111791,100821,Smallpox vials missing since 1950s found in la...,government workers at a research center near ...,Health,smallpox vials missing since 1950s found in la...
111792,86181,Scientists May Have Just Discovered the Key to...,harvard scientists may have just unlocked the...,Health,scientists may have just discovered the key to...
111793,256423,Justin Bieber to plead guilty to DUI,"justin bieber to plead guilty to duifri, 13 ju...",Entertainment,justin bieber to plead guilty to dui justin bi...


In [None]:
df_test['Title Content'] = pd.DataFrame(df_test['Title'] + ' ' + df_test['Title'] + ' ' + df_test['Title'] + ' ' + df_test['Content'])
df_test = shallow_cleaning(df_test)
display(df_test)

Unnamed: 0,Id,Title,Content,Title Content
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,tracy morgan upgraded to fair condition after ...
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphones weigh on samsung electronics as gu...
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi denies fumbling testimony on x men directo...
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorette 2014 spoilers week 3 recap eric h...
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honours frankie knuckles in lette...
...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standardizing electr...
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford has been filming the seventh sta...
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s games games games as microsoft plans to c...
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...


## Stemming

To enhance the classification performance we proceeded to stem the data of *Title Content* column. 

Stemming is a Text Normalization (or sometimes called Word Normalization) techniques in the field of Natural Language Processing that are used to prepare text, words, and documents for further processing. Stemming helps us to achieve the root forms (sometimes called synonyms in search context) of inflected (derived) words. Thus, we define as Stem (root) the part of the word to which we add inflectional (changing/deriving) affixes such as (-ed,-ize, -s,-de,mis). So stemming a word or sentence may result in words that are not actual words. Stems are created by removing the suffixes or prefixes used with a word.

For this process we used PorterStemmer for the English language. 

In [None]:
porter = PorterStemmer()

df['Title Content'] = df['Title Content'].apply(lambda x: word_tokenize(x))
df['Title Content'] = df['Title Content'].apply(lambda x: [porter.stem(word) for word in x])
df['Title Content'] = df['Title Content'].apply(lambda x: ' '.join(str(word) for word in x))

df.head()


Unnamed: 0,Id,Title,Content,Label,Title Content
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment,netflix is come to cabl box and amazon is now ...
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment,pharrel iranian presid react to tehran happi a...
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology,wildlif servic seek comment wildlif servic see...
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology,facebook team up with story to launch fb newsw...
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business,caesar plan us 880 mln new york casino caesar ...


In [None]:
df_test['Title Content'] = df_test['Title Content'].apply(lambda x: word_tokenize(x))
df_test['Title Content'] = df_test['Title Content'].apply(lambda x: [porter.stem(word) for word in x])
df_test['Title Content'] = df_test['Title Content'].apply(lambda x: ' '.join(str(word) for word in x))

df_test.head()

Unnamed: 0,Id,Title,Content,Title Content
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...


In [None]:
df.Label.unique()

array(['Entertainment', 'Technology', 'Business', 'Health'], dtype=object)

## Creating Bag of Words

>Appending some extra words in the stopwords' list provided by nltk library. Creating a representative list of stopwords for the dataset at hand, helps to suppress as much as possible the noise of words that do not really play a key role in the classification process (chances are that they are just indifferent to the article's meaning and as the result to the classification result)



In [None]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words.append("said")
stop_words.append("say")
stop_words.append("says")
stop_words.append("one")
stop_words.append("also")
stop_words.append("may")
stop_words.append("will")
stop_words.append("seem")
stop_words.append("many")
stop_words.append("much")
stop_words.append("think")
stop_words.append("like")
stop_words.append("would")
stop_words.append("even")
stop_words.append("well")
stop_words.append("time")

## Tfidf Vectorizer

We use the tfidf vectorizer to create the bag of words. TF-IDF stands for Term Frequency-Inverse Document Frequency. The Bag of Words (BoW) model is the simplest form of text representation in numbers. Like the term itself, we can represent a sentence as a bag of words vector (a string of numbers). A simple Bag of Words (like one created with the use of CountVectorizer class), just creates a set of vectors containing the count of word occurrences in the corpus given. On the other hand, the TF-IDF model contains information on the more important words and the less important ones as well. In this case we create select to contain the size of the information to 10000 features. This number comes after multiple trials with various different numbers of features. But if we either defined more (e.g. max_features=20000 etc.) or less (e.g. max_features=2000 etc.), during the classification the cross validation metrics were significantly deteriorating.

In [None]:
vectorizer=TfidfVectorizer(max_features=10000, stop_words=stop_words, ngram_range=(1, 2))

temp = df['Title Content']
labels = df['Label']

vectorized_content = vectorizer.fit_transform(temp)
train_bow = vectorized_content
train_bow

<111795x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 18064785 stored elements in Compressed Sparse Row format>

In [None]:
temp = df_test['Title Content']

vectorized_test_content = vectorizer.fit_transform(temp)
test_bow = vectorized_test_content
test_bow

<47912x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 7751755 stored elements in Compressed Sparse Row format>

**Label encoding**

In [None]:
from sklearn import preprocessing
label_enc = preprocessing.LabelEncoder()
labels = label_enc.fit_transform(df['Label'])
labels

array([1, 1, 3, ..., 2, 1, 1])

Useful method declarations for csv file creation and statistics calculation.

In [None]:
def create_file(data, prediction, filename):
  res_df = pd.DataFrame(data)
  res_df['Predicted'] = label_enc.inverse_transform(prediction)
  res_df.to_csv(filename+".csv", columns=['Id', 'Predicted'], index=False)
  return res_df

res_mean = pd.DataFrame([])
res_mean = res_mean.rename_axis('Statistic Measure', axis=1)
columnNum=0
def calculate_statistic_metrics(res, res_mean, columnName, columnNum):
  temp_df = pd.DataFrame([])
  accuracy_mean = pd.Series(res['test_accuracy'].mean(), name='Accuracy')
  precision_mean = pd.Series(res['test_precision_macro'].mean(), name='Precision')
  recall_mean = pd.Series(res['test_recall_macro'].mean(), name='Recall')
  F1_mean = pd.Series(res['test_f1_macro'].mean(), name='F-measure')
  temp_df = temp_df.append(accuracy_mean)
  temp_df = temp_df.append(precision_mean)
  temp_df = temp_df.append(recall_mean)
  temp_df = temp_df.append(F1_mean)
  res_mean[columnName] = temp_df[0]
  res_mean = res_mean.rename(columns={columnNum:columnName})
  return res_mean

#**Classification Task**
For each algorithm's parameters we conducted multiple experiments in order to determine the best combination with the data.


---

##**Classification Algorithms + Bag Of Words**

### **SVM**

>For the SVM i used the sklearn function [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). LinearSVC is similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and also scales better to large numbers of data samples. This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme.

### **5-Fold Cross Validation**
>According to the 5-Fold cross validation over the test data, the linear svm model given the BoW we created before, scores high in all the metrics we consider. This cross validation indicates that our trained model should predict the class of each article correctly (True-Positive) 96,8% of the times, while it presents to have really low instances of False-Positive class predictions. Overall the model's accuracy reaches the score of 97%.

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate

In [None]:
X=train_bow
y=labels
clf = LinearSVC(random_state=42, tol=1e-5, C=0.25)

In [None]:
scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf, X, y, cv=5, n_jobs=4, scoring=scoring)
svm_res = pd.DataFrame.from_dict(scores)
res_mean = calculate_statistic_metrics(svm_res, res_mean, 'SVM (BoW)', columnNum)
display(res_mean)
res_mean.to_csv('evaluationResults.csv')

Statistic Measure,SVM (BoW)
Accuracy,0.970008
Precision,0.968411
Recall,0.965929
F-measure,0.967147


Make prediction for the test data with svm model

In [None]:
clf.fit(X, y)
y_pred_test = clf.predict(test_bow)
svm_bow_pred_df = create_file(df_test, y_pred_test, 'svm_bow_pred')
display(svm_bow_pred_df)

Unnamed: 0,Id,Title,Content,Title Content,Predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...,Technology
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...,Entertainment
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...,Entertainment
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...,Technology
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...,Entertainment
...,...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standard electr car ...,Technology
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford ha been film the seventh star wa...,Business
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s game game game as microsoft plan to close...,Technology
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...,Technology


### **Random Forest**

>A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. We use the classifier with its default parameters.

###**5-Fold Cross Validation**

>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

In [None]:
X=train_bow
y=labels

clf_Rf = RandomForestClassifier(n_jobs=-1, n_estimators=100)

In [None]:
scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf_Rf, X, y, cv=5, n_jobs=4, scoring=scoring)
rf_res = pd.DataFrame.from_dict(scores)
res_mean = calculate_statistic_metrics(rf_res, res_mean, 'Random Forest (BoW)', columnNum)
display(res_mean)
res_mean.to_csv('evaluationResults')

Statistic Measure,SVM (BoW),Random Forest (BoW)
Accuracy,0.970008,0.939559
Precision,0.968411,0.940052
Recall,0.965929,0.927771
F-measure,0.967147,0.933616


Make label prediction for test data with bow

In [None]:
clf_Rf.fit(X, y)
y_pred = clf_Rf.predict(test_bow)
rf_bow_pred_df = create_file(df_test, y_pred, 'rf_bow_pred')
display(rf_bow_pred_df)

Unnamed: 0,Id,Title,Content,Title Content,Predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...,Entertainment
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...,Technology
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...,Technology
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...,Entertainment
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...,Technology
...,...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standard electr car ...,Business
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford ha been film the seventh star wa...,Business
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s game game game as microsoft plan to close...,Business
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...,Business


#**Classification + BoW + SVD**


---


### SVD
Dimensionality reduction using [Truncated SVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html) (aka LSA).
This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). This estimator does not center the data before computing the singular value decomposition. This means it can work with sparse matrices efficiently (*Note: tfidf returns a sparse matrix*).

In [None]:
from sklearn.decomposition import TruncatedSVD

X=train_bow
y=labels
Z=test_bow

svd = TruncatedSVD(n_components=100, random_state=42)
train_bow_svd = svd.fit_transform(X,y)
test_bow_svd = svd.fit_transform(Z)

print(y.shape)
print(train_bow_svd.shape)
print(test_bow_svd.shape)

(111795,)
(111795, 100)
(47912, 100)


### **SVM**

>**5-Fold Cross Validation:** In this experiment, the accuracy of the model from the cross validation is ~92%. Its precision is calculated to be ~91.8% (the ratio of correctly predicted observations to the total predicted observations).  The recall is ~90.6%, meaning that this quantum of predicted observations were actually a correct prediction. Finally the f-measure is ~91.2% (the average of precision and recall). 
This experiment, after being trained with all the train data, it gives a much better score (Accuracy: ~82%). 


In [None]:
clf = LinearSVC(random_state=42, tol=1e-5, C=0.25)
scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf, train_bow_svd, y, cv=5, n_jobs=4, scoring=scoring)
svm_res_svd = pd.DataFrame.from_dict(scores)
res_mean = calculate_statistic_metrics(svm_res_svd, res_mean, 'SVM (SVD)', columnNum)
display(res_mean)

Statistic Measure,SVM (BoW),Random Forest (BoW),SVM (SVD)
Accuracy,0.970008,0.939559,0.922054
Precision,0.968411,0.940052,0.918617
Recall,0.965929,0.927771,0.906882
F-measure,0.967147,0.933616,0.912456


In [None]:
#Predict for test set
y=labels
clf.fit(train_bow_svd,y)
y_pred_test = clf.predict(test_bow_svd)
SVM_pred_svd = create_file(df_test, y_pred_test, 'svm_bow_svd_pred')
SVM_pred_svd

Unnamed: 0,Id,Title,Content,Title Content,Predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...,Entertainment
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...,Business
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...,Entertainment
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...,Entertainment
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...,Entertainment
...,...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standard electr car ...,Technology
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford ha been film the seventh star wa...,Entertainment
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s game game game as microsoft plan to close...,Technology
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...,Technology


## **Random Forest**
>**5-Fold Cross Validation:** In this experiment, the accuracy of the model from the cross 
validation is ~95.1%. Its precision is calculated to be ~94.9% (the ratio of correctly 
predicted observations to the total predicted observations).  The recall is ~94.1%, 
meaning that this quantum of predicted observations were actually a correct prediction. 
Finally the f-measure is ~94.5% (the average of precision and recall). 
This experiment, after being trained with all the train data, it gives a much better score (Accuracy: ~86%). This is also the experiment that gave us the best prediction results in kaggle leaderboard.


In [None]:
y=labels
clf_Rf = RandomForestClassifier(n_jobs=-1)
scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf_Rf, train_bow_svd, y, cv=5, n_jobs=4, scoring=scoring)
rf_res_svd = pd.DataFrame.from_dict(scores)
res_mean = calculate_statistic_metrics(rf_res_svd, res_mean, 'Random Forest (SVD)', columnNum)
display(res_mean)

Statistic Measure,SVM (BoW),Random Forest (BoW),SVM (SVD),Random Forest (SVD)
Accuracy,0.970008,0.939559,0.922054,0.951286
Precision,0.968411,0.940052,0.918617,0.949943
Recall,0.965929,0.927771,0.906882,0.941382
F-measure,0.967147,0.933616,0.912456,0.945504


In [None]:
#Predict for test set 
y=labels
clf_Rf = RandomForestClassifier(n_jobs=-1)
clf_Rf.fit(train_bow_svd,y)
y_pred_test = clf_Rf.predict(test_bow_svd)
rf_pred_svd = create_file(df_test, y_pred_test, 'rf_bow_svd_pred')
display(rf_pred_svd)

Unnamed: 0,Id,Title,Content,Title Content,Predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...,Entertainment
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...,Business
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...,Entertainment
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...,Entertainment
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...,Entertainment
...,...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standard electr car ...,Technology
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford ha been film the seventh star wa...,Entertainment
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s game game game as microsoft plan to close...,Technology
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...,Technology


# **My method - KNN**


---


>[ΚΝΝ](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html) is a simple classifier. In our case it scores about the same as the SVM+BoW+SVD experiment. We used it with its default parameters were k=5.

>**5-Fold Cross Validation:** In cross validation for KNN we get ~96,4 accuracy. The model’s precision is ~96.2% and the recall ~95.9%. Finally the f-measure is 96.1%. Pretty decent results for the classifier. 


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
X = train_bow_svd
y = labels

clf_nb = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

In [None]:
scoring=['accuracy','precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(clf_nb, X, y, cv=5, n_jobs=-1, scoring=scoring)
nb_res = pd.DataFrame.from_dict(scores)
res_mean = calculate_statistic_metrics(nb_res, res_mean, 'My Method', columnNum)
res_mean.to_csv('evaluationResults')
display(res_mean)

Statistic Measure,SVM (BoW),Random Forest (BoW),SVM (SVD),Random Forest (SVD),My Method
Accuracy,0.970008,0.939559,0.922054,0.951286,0.964757
Precision,0.968411,0.940052,0.918617,0.949943,0.962784
Recall,0.965929,0.927771,0.906882,0.941382,0.959727
F-measure,0.967147,0.933616,0.912456,0.945504,0.961227


In [None]:
clf_nb.fit(X,y)
y_pred_test = clf_nb.predict(test_bow_svd)
rf_pred_svd = create_file(df_test, y_pred_test, 'knn_pred')
display(rf_pred_svd)

Unnamed: 0,Id,Title,Content,Title Content,Predicted
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...,traci morgan upgrad to fair condit after crash...,Entertainment
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...,smartphon weigh on samsung electron as guidanc...,Business
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...,fbi deni fumbl testimoni on x men director bry...,Entertainment
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...,bachelorett 2014 spoiler week 3 recap eric hil...,Entertainment
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...,barack obama honour franki knuckl in letter to...,Entertainment
...,...,...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo...",bmw tesla meet to discuss standard electr car ...,Technology
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...,harrison ford ha been film the seventh star wa...,Entertainment
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...,it s game game game as microsoft plan to close...,Technology
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...,app detail microsoft excel for ipad app detail...,Technology
