In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting graphs
import sklearn
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input/ai-academy-intermediate-class-competition-1"))

# Any results you write to the current directory are saved as output.

## Load and Explore the Data

In [None]:
# Location of the labelled training articles inside the competition's input folder.
TRAIN_PATH = os.path.join(
    "../input/ai-academy-intermediate-class-competition-1",
    "BBC News Train.csv",
)

# Read the training CSV into `df`, the DataFrame the rest of the notebook works on.
df = pd.read_csv(TRAIN_PATH)

In [None]:
# Preview the first five rows to confirm the file loaded correctly
# and to see which columns the DataFrame contains.

df.head(n=5)

In [None]:
# Encode each category name as an integer code and store it in a new column `category_id`.
# factorize() returns (codes, uniques); only the codes are kept here.
df['category_id'] = pd.factorize(df['Category'])[0]

# Sanity check: look at the first 10 codes.
df['category_id'].iloc[:10]

In [None]:
# One row per distinct (Category, category_id) pair, ordered by category_id.
category_id_df = (
    df.loc[:, ['Category', 'category_id']]
      .drop_duplicates()
      .sort_values(by='category_id')
)

In [None]:
# Display the category-name / category_id lookup table built in the previous cell.
category_id_df

In [None]:
# Two lookup tables built from category_id_df:
#   category_to_id : category name -> category_id
#   id_to_category : category_id   -> category name
category_to_id = dict(category_id_df[['Category', 'category_id']].values)
id_to_category = {category_id: name for name, category_id in category_to_id.items()}

In [None]:
# Display the category_id -> category name mapping.
id_to_category

In [None]:
# Show 5 randomly chosen rows; the fixed random_state makes the selection repeatable.
df.sample(n=5, random_state=0)

In [None]:
# Number of news articles in each category.
df.groupby('Category')['category_id'].count()


In [None]:
# Bar chart of the number of news articles per category (y axis starts at 0).
df.groupby('Category')['category_id'].count().plot(kind='bar', ylim=0)

## Convert words in the news articles into numerical features using tf-idf

sklearn.feature_extraction.text.TfidfVectorizer will be used to calculate a tf-idf vector for each of our documents. 
Note that we are passing a number of parameters to this class:

*  **sublinear_tf** is set to True to use a logarithmic form for frequency, to give diminishing returns as the frequency of a word increases. This is usually preferable for a number of reasons, one of which is Zipf's Law.
*  **min_df** is the minimum number of documents a word must be present in to be kept, and we are setting it to 5. This is to avoid rare words, which drastically increase the size of our features and might cause overfitting.
*  **norm** is set to l2, to ensure all our feature vectors have a Euclidean norm of 1. This is helpful for visualizing these vectors, and can also improve (or deteriorate) the performance of some models.
* **encoding** is set to latin-1 which is used by our input text.
*  **ngram_range** is set to (1, 2) to indicate that we want to consider both unigrams and bigrams, or in other terms: we want to consider single words ("prices", "player") and pairs of words ("stock prices", "football player").
*  **stop_words** is set to "english" to remove common English stop words ("a", "the", ...) and further reduce the number of noisy features.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings, one per line so each choice is easy to spot.
tfidf = TfidfVectorizer(
    sublinear_tf=True,       # log-scaled term frequency
    min_df=5,                # ignore terms that occur in fewer than 5 documents
    norm='l2',               # scale each document vector to unit Euclidean length
    encoding='latin-1',
    ngram_range=(1, 2),      # unigrams and bigrams
    stop_words='english',
)

# Learn the vocabulary from the article text and build a dense tf-idf matrix:
# one row per article, one column per retained term.
features = tfidf.fit_transform(df['Text']).toarray()

# Integer category code of every article, aligned row-for-row with `features`.
labels = df['category_id']


In [None]:
# Shape of the tf-idf matrix: (number of articles, number of features kept by the vectorizer)
features.shape # How many features are there ? 

In [None]:
# Display the dense tf-idf matrix (rows = articles, columns = terms).
features

In [None]:
# Recall the dictionary that maps category names to numeric ids;
# .items() exposes its (name, id) pairs.
category_to_id.items()

In [None]:
# sorted() turns the (name, id) pairs into a list ordered alphabetically by category name.
# A dict can be iterated directly, but sorting gives a stable, readable order.
# Later cells iterate over the categories using this sorted list.
sorted(category_to_id.items())

In [None]:
# Use a chi-squared test to find the terms (features) most strongly associated
# with each news category (label).

from sklearn.feature_selection import chi2

N = 3  # Number of top unigrams / bigrams to report for each category

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer get_feature_names_out() and fall back to the old accessor on older releases.
# The vocabulary does not change between categories, so it is looked up once, outside the loop.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.array(tfidf.get_feature_names())

# For each category (in alphabetical order), rank every term by its chi-squared statistic
for Category, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)                   # (statistics, p-values) for "this category vs. all others"
  chi2_order = np.argsort(features_chi2[0])                               # Feature positions in increasing order of chi-squared statistic
  feature_names = all_feature_names[chi2_order]                           # Feature names in that same increasing order
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]         # Single-word features (increasing chi-squared)
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]          # Two-word features (increasing chi-squared)
  print("# '{}':".format(Category))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:]))) # The N unigrams with the highest statistic
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:]))) # The N bigrams with the highest statistic
# Summary: one chi-squared analysis per category, printing the single words
# and word pairs most associated with that category.

In [None]:
# NOTE: features_chi2 is reassigned on every pass of the loop above, so this shows
# the result for the LAST category processed only.
features_chi2
# Tuple of (chi-squared statistics, p-values), one entry per feature. A small p-value means
# the observed association would be unlikely if the feature were independent of the category.

## Use t-SNE : A  Dimensionality reduction  technique to visualize ( in 2 dimensions), a high dimensional space
### t-Distributed Stochastic Neighbor Embedding: keeps similar instances close and dissimilar instances apart

In [None]:
from sklearn.manifold import TSNE

# t-SNE is a manifold-learning projection: it maps the high-dimensional tf-idf vectors
# down to 2 dimensions while trying to keep similar articles close together.
# It is computationally expensive, so only a random 30% of the articles are projected.
SAMPLE_SIZE = int(0.3 * len(features))

# Fixed seed so the same subset of rows is drawn on every run.
np.random.seed(0)
indices = np.random.choice(len(features), size=SAMPLE_SIZE, replace=False)  # row positions of the sampled articles

# 2-D coordinates of the sampled articles, one row per entry of `indices`.
projected_features = TSNE(
    n_components=2,
    random_state=0,
).fit_transform(features[indices])

In [None]:
# Check what kind of object TSNE.fit_transform returned.
type(projected_features)

In [None]:
# Pick one category_id and show the projected 2-D points of the sampled articles in it.
my_id = 0
projected_features[labels[indices].eq(my_id).values]

### Plot the 2-dimensional distribution identified by t-SNE

In [None]:
# One colour per category_id (position in the list == category_id).
colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey']

# Draw the sampled articles of each category as their own scatter series.
for category, category_id in sorted(category_to_id.items()):
    in_category = labels[indices].eq(category_id).values   # boolean mask over the sampled rows
    points = projected_features[in_category]
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, s=30, c=colors[category_id], label=category)

plt.title(
    "tf-idf feature vector for each article, projected on 2 dimensions.",
    fontdict={'fontsize': 15},
)
plt.legend()

# Model Training and Evaluation
### We will try 3 different classification models on the data : 
            Logistic Regression
            RandomForestClassifier
            MultinomialNB (Naive Bayes) - assumes the features (word frequencies) are conditionally independent of each other given the category

In [None]:
# Reminder of the training matrix size: (articles, tf-idf features).
features.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_val_score

# Candidate classifiers to compare, in the order they will be evaluated.
# random_state is fixed where the estimator accepts it so results are repeatable.
models = []
models.append(RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0))
models.append(MultinomialNB())
models.append(LogisticRegression(random_state=0))


### Create a data frame that will store the results of various models.
Each model will be run 5 times with different test sets of 20%

In [None]:
# Number of cross-validation folds: 5 folds means each fold holds out 20% of the data.
CV = 5

# Placeholder results frame with one row per (model, fold) pair.
# It is replaced later, when cv_df is rebuilt from `entries`.
cv_df = pd.DataFrame(index=range(len(models) * CV))

# Accumulator for (model_name, fold_idx, accuracy) tuples; starts out empty.
entries = list()

## Run each Algorithm 5 times and store accuracy results in "entries"

In [None]:
# Start from an empty list every time this cell runs, so that re-running it
# does not append a second copy of the results and skew the plots and means below.
entries = []

# For each algorithm
for model in models:
  model_name = model.__class__.__name__
  # Fit and score the model on CV different train/test splits and collect the accuracies
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  # Record one (model_name, fold_idx, accuracy) tuple per fold
  # (3 models x 5 folds = 15 entries once the loop finishes)
  entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))

### Store results in the results dataframe

In [None]:
# Turn the collected (model, fold, accuracy) tuples into the results DataFrame with named columns.
result_columns = ['model_name', 'fold_idx', 'accuracy']
cv_df = pd.DataFrame(data=entries, columns=result_columns)

### Use seaborn to plot the results

seaborn is a library that runs on top of matplotlib and makes drawing fancier plots easier


In [None]:
import seaborn as sns

# Both plots share the same axes mapping: model on x, fold accuracy on y.
accuracy_by_model = dict(x='model_name', y='accuracy', data=cv_df)

# Box plot of the accuracy distribution per model ...
sns.boxplot(**accuracy_by_model)
# ... with the individual fold accuracies drawn on top as jittered points.
sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2, **accuracy_by_model)

In [None]:
# Average cross-validation accuracy of each algorithm across its folds.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
# Display the full table of per-fold accuracies (one row per model and fold).
cv_df

# Model fit Logistic regression with 33% of data randomly chosen for test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the articles for testing. df.index is split alongside the data so that
# each test row can be traced back to its original article later.
split = train_test_split(features, labels, df.index, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test, indices_train, indices_test = split

# Train a logistic-regression classifier on the training portion.
model = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict on the held-out articles: hard class labels and per-class probabilities.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

### Print confusion matrix in test data using seaborn

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Pin the row/column order to the category ids in category_id_df (sorted by category_id).
# Without `labels=`, the matrix only covers classes that occur in y_test / y_pred, so a
# missing class would silently misalign the tick labels and any conf_mat[actual, predicted] lookup.
category_ids = category_id_df.category_id.values
conf_mat = confusion_matrix(y_test, y_pred, labels=category_ids)

# Heat map of the counts: rows are the actual category, columns the predicted category.
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Optional
### Study the failing scenarios 
### Print the cases where articles were miscategorized in the same way 2 or more times

In [None]:
from IPython.display import display

# For every pair of distinct categories, report confusions that happened at least twice
# and show the text of the affected test articles.
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    n_confused = conf_mat[actual, predicted]
    if predicted == actual or n_confused < 2:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
    # Original DataFrame labels of the test rows with this (actual, predicted) combination
    confused_rows = indices_test[(y_test == actual) & (y_pred == predicted)]
    display(df.loc[confused_rows]['Text'])
    print('')

# Finally - Use all the data to train the model 

In [None]:
# Refit the same LogisticRegression instance on ALL labelled articles (no hold-out);
# this replaces the earlier fit on the training split.
model.fit(features, labels)

### Print top 5 words / two-word combos for each Category

In [None]:
# model.coef_ holds the learned weight of each feature for each category
# (a later cell indexes it as model.coef_[category_id], i.e. one row per category).
model.coef_

In [None]:
from sklearn.feature_selection import chi2

N = 5  # Number of top unigrams / bigrams to print for each category

# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer get_feature_names_out() and fall back to the old accessor on older releases.
# The vocabulary is the same for every category, so it is looked up once, outside the loop.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.array(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
  # Rank features by the model's coefficient for this category (ascending).
  # A dedicated name is used so the t-SNE sample `indices` are not overwritten.
  coef_order = np.argsort(model.coef_[category_id])
  feature_names = all_feature_names[coef_order]
  # Walk the ranking from the largest coefficient down and keep the first N of each kind
  unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
  bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
  print("# '{}':".format(Category))
  print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
  print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# A few made-up headlines to try the trained classifier on.
texts = [
    "Hooli stock price soared after a dip in PiedPiper revenue growth.",
    "Captain Tsubasa scores a magnificent goal for the Japanese team.",
    "Merryweather mercenaries are sent on another mission, as government oversight groups call for new sanctions.",
    "Beyoncé releases a new album, tops the charts in all of south-east Asia!",
    "You won't guess what the latest trend in data analysis is!",
]

# Vectorise with the already-fitted tf-idf vocabulary, then predict a category id for each text.
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)

# Print each headline followed by the name of its predicted category.
for idx, predicted in enumerate(predictions):
  print('"{}"'.format(texts[idx]))
  print("  - Predicted as: '{}'".format(id_to_category[predicted]))
  print("")

# Submitting your work

In [None]:
import os

# Show which files are available in the test-data input folder.
test_input_files = os.listdir("../input/bbc-test")
print(test_input_files)

In [None]:
# Location of the unlabelled test articles.
TEST_PATH = os.path.join(
    "../input/bbc-test",
    "BBC News Test.csv",
)

# Read the test CSV into its own DataFrame.
test_df = pd.read_csv(TEST_PATH)



In [None]:
# Display the test DataFrame.
test_df

In [None]:
# First five rows of the test set.
test_df.head(n=5)

In [None]:
# Preview the text column as a plain Python list. Only the first few articles are shown:
# a list is not truncated on display, so dumping every full article would flood the output.
test_df.Text.head().tolist()

In [None]:
# Convert the text column to a list and vectorise it with the tf-idf vocabulary
# that was fitted on the training articles (transform only, no re-fitting).
test_texts = test_df['Text'].tolist()
test_features = tfidf.transform(test_texts)

# Predicted category id for every test article.
Y_pred = model.predict(test_features)

Y_pred

In [None]:
# Predictions are numeric category ids; map each one back to its category name.
Y_pred_name = [id_to_category[cat_id] for cat_id in Y_pred]

In [None]:
# Display the predicted category names (one per test article, in test_df row order).
Y_pred_name

In [None]:
# Assemble the submission table: the article id from the test set next to its predicted category name.
submission_columns = {
    "ArticleId": test_df["ArticleId"],
    "Category": Y_pred_name,
}
submission = pd.DataFrame(data=submission_columns)

In [None]:
# Display the submission table before writing it to disk.
submission

In [None]:
# Write the submission table to a CSV file in the working directory.
# Any file name would do; the row index is left out so only the two columns are written.
SUBMISSION_FILE = 'submission.csv'
submission.to_csv(SUBMISSION_FILE, index=False)

In [None]:
# List the working directory, presumably to confirm submission.csv was written.
# NOTE: a bare `ls` relies on IPython's automagic and is not valid in a plain Python script.
ls