# News articles data analysis.

### Notes:
- Before we make our wordclouds and apply classification and clustering methods to our data,
    we make sure to run *generate_train_test_sets.ipynb*, in order to create the train and test sets.
  

## Preparations

### Importing Data

In [1]:
import pandas as pd
from matplotlib import pyplot as plt

# df = pd.read_csv("data.tsv", sep='\t', encoding = 'ANSI')

In [2]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words



from sklearn import datasets, svm

# Natural language processing kit imports.
from nltk.stem import WordNetLemmatizer

### Function prototype for text preprocessing.

<span style="color:DeepPink">**preprocess_article**</span>**(text)**  
&nbsp;&nbsp;Removes special characters from a given string object, removes stop words and lematizes words using WordNetLematizer().  
&nbsp;&nbsp;&nbsp;**Parameters: &nbsp;&nbsp;&nbsp;text : str**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
String object to process. 

&nbsp;&nbsp;&nbsp;**Returns: &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;text : str**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
Lowercase lematized string object without stopwords and several special characters.

In [3]:
# import re

In [4]:
# stop_words = list(stop_words)

""" In previous version of our project, the wordclouds below showed that 'said' and 'say' words
    appear the most in the data, so we decided to remove them as they has no valuable meaning. """
# stop_words.extend(['said','say'])
wordnet_lemmatizer = WordNetLemmatizer()

""" Make sure that the text parameter and return variable are of string type. """
def preprocess_article(text: str) -> str:
    # Remove newlines and \r characters.
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    
    # Remove quotes
    text = text.replace('"', ' ')
   
    # Convert text to lowercase.
    text = text.lower()
    
    # Remove punctuation and many special characters.
    text = text.translate(str.maketrans('', '', '!?:\';.,[]()@#$%^&*£'))
   
    # Remove terminating 's characters.
    text = text.replace("'s", "")

    # Remove stop words. Note: do this first and then lemmatize because lemmatizing
    # can change words like 'has' to 'ha'.
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Lematize text with WordNetLemmatizer().
    text = ' '.join([wordnet_lemmatizer.lemmatize(word) for word in text.split(' ')])
    
    # Remove all words with numbers in them (ie. 400bn, 512kbps etc.) .
    text = re.sub(r'\w*\d\w*', '', text).strip()
    
    return text

### Article content preprocessing
Apply the above function to the content column. It will result it better wordclouds and produce better classification results.

In [5]:
# df['content'] = df['content'].apply(preprocess_article)
# df.head()

# 1. Word Clouds per Category

For the wordclouds we need all the data we've got.

In [6]:
from wordcloud import WordCloud

This function takes as parameter a string representing one of the dataframe's categories,
and returns all the articles' content in that category as a string.

In [7]:
def choose_category_content(category: str) -> str:
    articles_series = df[df['category'] == category]['content']
    words = ' '.join(articles_series)
    return words

def wordcloud_gen(category):
    wordcloud = WordCloud(
        width = 1600,
        height = 1000,
        background_color = "white",
        min_font_size = 10).generate(choose_category_content(category))

    plt.figure(figsize = (16, 10), facecolor = None) 
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

Build wordclouds

### Business Word Cloud

In [8]:

# entertainment_wc = WordCloud(width=1920, background_color = 'white', height=1080).generate(choose_category_content('entertainment'))

# politics_wc = WordCloud(width=1920, background_color = 'white', height=1080).generate(choose_category_content('politics'))

# sport_wc = WordCloud(width=1920, background_color = 'white', height=1080).generate(choose_category_content('sport'))

# tech_wc = WordCloud(width=1920, background_color = 'white', height=1080).generate(choose_category_content('tech'))

In [9]:
# wordcloud_gen("business")

### Entertainment Word Cloud

In [10]:
# wordcloud_gen("entertainment")

### Politics Word Cloud

In [11]:
# wordcloud_gen("politics")

### Sport Word Cloud

In [12]:
# wordcloud_gen("sport")

### Tech Word Cloud

In [13]:
# wordcloud_gen("tech")

## Just a few worth-reading observations regarding the wordclouds
First of all, most of the words in each word clouds are pretty relevant to the respective categories.
Another interesting thing is the word **said**. One quick logical thought is that it would alter the classification results, either little or more, it doesn't matter.

We can prove that it won't, by doing a chi-squared test on our data.

Chi-squared test can measure 


In [14]:
# df['category_id'] = df.category.factorize()[0]

In [15]:
# tf_idf = TfidfVectorizer(max_features = 100, ngram_range = (1, 2))

# features = tf_idf.fit_transform(df.content).toarray()
# features.shape

In [16]:
# from sklearn.feature_selection import chi2
# N = 2
# labels = df.category_id
# for category, category_id in sorted(category_to_id.items()):
#     features_chi2 = chi2(features, labels == category_id)
#     indices = np.argsort(features_chi2[0])
#     feature_names = np.array(tf_idf.get_feature_names())[indices]
#     unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#     bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#     print("# '{}':".format(category))
#     print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
#     print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 2 Classification



### 2.1 Load our test and train datasets.

In [41]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import make_scorer,    \
                            accuracy_score, \
                            precision_score,\
                            recall_score,   \
                            f1_score,       \
                            classification_report

train_set = pd.read_csv("train_set.tsv", sep='\t', encoding = 'ANSI')
test_set = pd.read_csv("test_set.tsv", sep='\t', encoding = 'ANSI')
test_labels = pd.read_csv("test_labels.tsv", sep='\t', encoding = 'ANSI')

In [42]:
train_content = train_set['content']
train_labels = train_set['category']

In [43]:
test_content = test_set['content']
test_labels = test_labels['category']

### Encoding the train and test labels using sklearn.preprocessing.LabelEncoder

For the categories we will use the LabelEncoder.

In [45]:
le = LabelEncoder()
le.fit(train_labels)
cv_train_labels = le.transform(train_labels)
cv_test_labels = le.transform(test_labels)

### Trying pipelines

In [46]:

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SVC(kernel='rbf', C=1000, gamma=0.0001))])

text_clf.fit(train_content, train_labels)
pred = text_clf.predict(test_content)
rep = classification_report(test_labels, pred)

In [61]:
text_clf.score

<function sklearn.pipeline.Pipeline.score(self, X, y=None, sample_weight=None)>

#### 2.2.a Using CountVectorizer

In [23]:
count_vectorizer = CountVectorizer()

cv_train_content = count_vectorizer.fit_transform(train_content)
cv_test_content = count_vectorizer.fit_transform(test_content)

In [25]:


svm_clf = SVC(kernel='rbf',C=1000, gamma=0.0001)

### Trying grid search

In [26]:
# svm_clf = SVC()

# scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']

# param_grid = {
#             'kernel': ['rbf', 'linear'],
#             'C': [1e3, 5e3, 1e4, 5e4, 1e5],
#             'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
# }
# clf = GridSearchCV(svm_clf, param_grid)

In [27]:
# svm_clf.fit(cv_train_content, cv_train_labels)

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
# scoring = ['precision_macro', 'recall_macro', 'f1_macro', 'accuracy']

### Try cross validation

In [29]:
# score = cross_validate(svm_clf, cv_train_content, cv_train_labels, scoring=scoring)

# print("precision:", score['test_precision_macro'].mean())
# print("recall:",score['test_recall_macro'].mean())
# print("f1:",score['test_f1_macro'].mean())
# print("accuracy:",score['test_accuracy'].mean())

In [69]:
# svd = TruncatedSVD(n_components=1000, random_state=123)

# cv_test_content = svd.fit_transform(cv_test_content)
predicted_labels = svm_clf.predict(cv_test_content)

ValueError: X.shape[1] = 14749 should be equal to 26463, the number of features at training time

In [None]:
pred_rep = accuracy_score(cv_test_labels, predicted_labels)


In [None]:
print(pred_rep)