In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import re
import unicodedata
import nltk

import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

### Using the inshorts article data, practice using the modeling tools for NLP

**Acquire**

In [2]:
# Pull the inshorts articles for each category of interest via the acquire module
categories = [
    "business",
    "sports",
    "technology",
    "entertainment",
    "science",
    "world",
]
news_df = acquire.get_all_news_articles(categories)

# Preview the first few rows
news_df.head()

Unnamed: 0,title,content,category
0,Customers of banks under moratorium to get ₹5 ...,Finance Minister Nirmala Sitharaman today anno...,business
1,"₹3 lakh fine on Shilpa, Raj & his firm by SEBI...",Securities and Exchange Board of India (SEBI) ...,business
2,Old video of people laughing as Bezos talks ab...,An old video of Amazon Founder Jeff Bezos has ...,business
3,"Apple, Alphabet, Microsoft post combined quart...",Three of the world's largest tech companies - ...,business
4,This information isn't collected by govt: FM o...,"Finance Minister Nirmala Sitharaman replied ""t...",business


**Prepare**

In [3]:
# 'clean' holds each article's content after the three prepare steps, applied in order:
# basic_clean -> tokenize -> remove_stopwords
news_df['clean'] = (
    news_df['content']
    .apply(prepare.basic_clean)
    .apply(prepare.tokenize)
    .apply(prepare.remove_stopwords)
)


In [4]:
# Keep only the target (category) and the cleaned text for modeling
df = news_df.loc[:, ['category', 'clean']]
df.head()

Unnamed: 0,category,clean
0,business,finance minister nirmala sitharaman today anno...
1,business,securities exchange board india sebi imposed 3...
2,business,old video amazon founder jeff bezos gone viral...
3,business,three world ' largest tech companies apple goo...
4,business,finance minister nirmala sitharaman replied in...


In [5]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None):
    """
    3 way split for train, validate, and test datasets.

    20% of the rows are held out as test; 30% of the remaining rows are then
    held out as validate. Both splits use random_state=123 so they are
    reproducible.

    Parameters
    ----------
    df : the data to split (anything train_test_split accepts).
    stratify_by : optional column name. When given, both splits are
        stratified on df[stratify_by]; when None (the default) the splits
        are not stratified.

    Returns
    -------
    (train, validate, test)
    """
    # Only index into df when a column was actually requested; the default of
    # None must reach train_test_split as stratify=None rather than df[None].
    strata = None if stratify_by is None else df[stratify_by]
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=strata)

    # Re-derive the strata from the reduced train set so the lengths line up.
    strata = None if stratify_by is None else train[stratify_by]
    train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=strata)

    return train, validate, test

In [6]:
# Three-way split, stratified on the target so each split keeps the category mix
train, validate, test = split(df, stratify_by='category')

# Size of the training split
train.shape

(81, 2)

In [7]:
# Features: the cleaned article text for each split
X_train, X_validate, X_test = train['clean'], validate['clean'], test['clean']

# Targets: the article category for each split
y_train, y_validate, y_test = train['category'], validate['category'], test['category']

### Modeling

**Logistic Regression**

In [8]:
# Baseline: how many articles fall into each category
df['category'].value_counts()

science          25
sports           25
technology       24
world            24
business         24
entertainment    24
Name: category, dtype: int64

In [9]:
# Build the TF-IDF vectorizer and learn its vocabulary from the training text only
tfidf = TfidfVectorizer().fit(X_train)

# Vectorize every split with that same fitted vectorizer
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(texts) for texts in (X_train, X_validate, X_test)
)

In [10]:
# Create a logistic regression model and fit it to the vectorized training data
# (fit returns the estimator itself, so lm is the fitted model)
lm = LogisticRegression().fit(X_train_vectorized, y_train)

# Display the fitted estimator
lm

LogisticRegression()

In [11]:
# One results frame per split, holding the true category of each article.
# Note: this rebinds train/validate/test from the split data to results frames.
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

In [12]:
# Attach the logistic regression predictions to each results frame
train = train.assign(predicted=lm.predict(X_train_vectorized))
validate = validate.assign(predicted=lm.predict(X_validate_vectorized))
test = test.assign(predicted=lm.predict(X_test_vectorized))


In [13]:
# In-sample (train) performance of the logistic regression model
print(classification_report(y_true=train['actual'], y_pred=train['predicted']))

               precision    recall  f1-score   support

     business       0.92      0.92      0.92        13
entertainment       1.00      1.00      1.00        13
      science       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       0.92      0.92      0.92        13
        world       1.00      1.00      1.00        14

     accuracy                           0.98        81
    macro avg       0.97      0.97      0.97        81
 weighted avg       0.98      0.98      0.98        81



In [14]:
# Out-of-sample (validate) performance of the logistic regression model
print(classification_report(y_true=validate['actual'], y_pred=validate['predicted']))

               precision    recall  f1-score   support

     business       0.83      0.83      0.83         6
entertainment       0.80      0.67      0.73         6
      science       0.80      0.67      0.73         6
       sports       0.80      0.67      0.73         6
   technology       0.33      0.17      0.22         6
        world       0.45      1.00      0.62         5

     accuracy                           0.66        35
    macro avg       0.67      0.67      0.64        35
 weighted avg       0.68      0.66      0.64        35



**Takeaways**
- The Logistic Regression model performed better on train than on validate for every article category.
- Technology and world performed the worst on the validate.

**KNN Model**

In [16]:
# KNN classifier with k = 5, fit on the vectorized training data
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform').fit(X_train_vectorized, y_train)

# Results frames: the true category next to the KNN prediction for each split
train = pd.DataFrame({'actual': y_train, 'predicted': knn.predict(X_train_vectorized)})
validate = pd.DataFrame({'actual': y_validate, 'predicted': knn.predict(X_validate_vectorized)})
test = pd.DataFrame({'actual': y_test, 'predicted': knn.predict(X_test_vectorized)})

In [17]:
# In-sample (train) performance of the KNN model
print(classification_report(y_true=train['actual'], y_pred=train['predicted']))

               precision    recall  f1-score   support

     business       0.67      0.92      0.77        13
entertainment       0.82      0.69      0.75        13
      science       0.72      0.93      0.81        14
       sports       0.87      0.93      0.90        14
   technology       0.67      0.46      0.55        13
        world       0.90      0.64      0.75        14

     accuracy                           0.77        81
    macro avg       0.77      0.76      0.75        81
 weighted avg       0.78      0.77      0.76        81



In [18]:
# Out-of-sample (validate) performance of the KNN model
print(classification_report(y_true=validate['actual'], y_pred=validate['predicted']))

               precision    recall  f1-score   support

     business       0.60      1.00      0.75         6
entertainment       0.67      0.67      0.67         6
      science       0.67      0.67      0.67         6
       sports       0.67      0.67      0.67         6
   technology       0.00      0.00      0.00         6
        world       0.67      0.80      0.73         5

     accuracy                           0.63        35
    macro avg       0.54      0.63      0.58        35
 weighted avg       0.54      0.63      0.58        35



**Takeaways:**

- The KNN model was best able to predict the category for entertainment, science, sports and world articles
- Technology predictions were the worst
- Overall, the KNN model performed worse than the logistic regression model

**How does the KNN model do with a higher k? k = 10?**

In [19]:
# KNN classifier with k = 10, fit on the vectorized training data
knn = KNeighborsClassifier(n_neighbors=10, weights='uniform').fit(X_train_vectorized, y_train)

# Results frames: the true category next to the KNN prediction for each split
train = pd.DataFrame({'actual': y_train, 'predicted': knn.predict(X_train_vectorized)})
validate = pd.DataFrame({'actual': y_validate, 'predicted': knn.predict(X_validate_vectorized)})
test = pd.DataFrame({'actual': y_test, 'predicted': knn.predict(X_test_vectorized)})

In [20]:
# In-sample (train) performance of the k = 10 KNN model
print(classification_report(y_true=train['actual'], y_pred=train['predicted']))

               precision    recall  f1-score   support

     business       0.63      0.92      0.75        13
entertainment       0.83      0.77      0.80        13
      science       0.88      1.00      0.93        14
       sports       0.86      0.86      0.86        14
   technology       0.75      0.46      0.57        13
        world       0.83      0.71      0.77        14

     accuracy                           0.79        81
    macro avg       0.80      0.79      0.78        81
 weighted avg       0.80      0.79      0.78        81



In [21]:
# Out-of-sample (validate) performance of the k = 10 KNN model
print(classification_report(y_true=validate['actual'], y_pred=validate['predicted']))

               precision    recall  f1-score   support

     business       0.71      0.83      0.77         6
entertainment       0.57      0.67      0.62         6
      science       0.67      0.67      0.67         6
       sports       0.67      0.67      0.67         6
   technology       0.00      0.00      0.00         6
        world       0.67      0.80      0.73         5

     accuracy                           0.60        35
    macro avg       0.55      0.61      0.57        35
 weighted avg       0.54      0.60      0.57        35



**Takeaway:**

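- The KNN model with k = 10 performed slightly better than k = 5 on train (0.79 vs 0.77 accuracy) but slightly worse on validate (0.60 vs 0.63 accuracy), and in both cases still worse than the logistic regression model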

**Random Forest Model**



In [22]:
# Random forest classifier; random_state is fixed so the forest is reproducible
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=15,
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [23]:
# Fit the forest on the training data, then predict the training categories
# (fit returns the estimator, so the two steps can be chained)
y_pred = rf.fit(X_train_vectorized, y_train).predict(X_train_vectorized)

# Report in-sample accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf.score(X_train_vectorized, y_train)))

Accuracy of random forest classifier on training set: 0.89


In [24]:
# In-sample (train) performance of the random forest, using the predictions made above
print(classification_report(y_true=y_train, y_pred=y_pred))

               precision    recall  f1-score   support

     business       0.91      0.77      0.83        13
entertainment       1.00      0.85      0.92        13
      science       0.93      0.93      0.93        14
       sports       1.00      0.93      0.96        14
   technology       0.79      0.85      0.81        13
        world       0.78      1.00      0.88        14

     accuracy                           0.89        81
    macro avg       0.90      0.89      0.89        81
 weighted avg       0.90      0.89      0.89        81



In [25]:
# Out-of-sample (validate) predictions from the random forest
y_pred = rf.predict(X_validate_vectorized)

# Per-category performance on validate
print(classification_report(y_true=y_validate, y_pred=y_pred))

               precision    recall  f1-score   support

     business       0.60      0.50      0.55         6
entertainment       1.00      0.67      0.80         6
      science       0.40      0.33      0.36         6
       sports       0.83      0.83      0.83         6
   technology       0.25      0.17      0.20         6
        world       0.36      0.80      0.50         5

     accuracy                           0.54        35
    macro avg       0.57      0.55      0.54        35
 weighted avg       0.58      0.54      0.54        35



**Takeaway:**

RF does not do well, even when changing max_depth and min_samples_leaf


**Evaluate the Model That Performed Best on Validate (Logistic Regression) Using the Test DF**

In [26]:
# Rebuild the TF-IDF vectorizer, learning its vocabulary from the training text only
tfidf = TfidfVectorizer().fit(X_train)

# Vectorize every split with that same fitted vectorizer
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(texts) for texts in (X_train, X_validate, X_test)
)

# Refit the logistic regression model on the vectorized training data
lm = LogisticRegression().fit(X_train_vectorized, y_train)

# Results frames: the true category next to the logistic regression prediction for each split
train = pd.DataFrame({'actual': y_train, 'predicted': lm.predict(X_train_vectorized)})
validate = pd.DataFrame({'actual': y_validate, 'predicted': lm.predict(X_validate_vectorized)})
test = pd.DataFrame({'actual': y_test, 'predicted': lm.predict(X_test_vectorized)})


In [27]:
# Performance of the logistic regression model on the held-out test data
print(classification_report(y_true=test['actual'], y_pred=test['predicted']))

               precision    recall  f1-score   support

     business       0.50      0.40      0.44         5
entertainment       1.00      0.80      0.89         5
      science       0.57      0.80      0.67         5
       sports       1.00      0.80      0.89         5
   technology       0.50      0.20      0.29         5
        world       0.44      0.80      0.57         5

     accuracy                           0.63        30
    macro avg       0.67      0.63      0.62        30
 weighted avg       0.67      0.63      0.62        30



**Takeaways:**

- The LM model has 100% precision for entertainment and sports articles (every article it labels as one of these is correct), though its recall for each is 80%.
- It generally does not perform as well as it did on the validate data for the other categories, and it has a precision of only 44% for classifying world articles.
