# model nlp

In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import unicodedata
import re
from env import user, password, host
import numpy as np
import acquire as a
from wordcloud import WordCloud
import nltk.sentiment
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split


# Exercises

Do your work for this exercise in a file named model.

Take the work we did in the lessons further:

* What other types of models (i.e. different classifcation algorithms) could you use?
* How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

Goal: to predict if the category of an article is Technology

# acquire Data

In [83]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']
def clean(text):
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [84]:
inshort_df = a.inshort_info()

In [85]:
inshort_df = pd.DataFrame(inshort_df)

In [86]:
sports_words = clean(' '.join(inshort_df.content[inshort_df.category == 'sports']))
politics_words = clean(' '.join(inshort_df.content[inshort_df.category == 'politcs']))
technology_words = clean(' '.join(inshort_df.content[inshort_df.category == 'technology']))
science_words = clean(' '.join(inshort_df.content[inshort_df.category == 'science']))
entertainment_words = clean(' '.join(inshort_df.content[inshort_df.category == 'entertainment']))
world_words = clean(' '.join(inshort_df.content[inshort_df.category == 'world']))

In [87]:
inshort_df.category.value_counts()

national         25
business         25
sports           25
politics         25
startup          25
hatke            25
automobile       25
world            24
entertainment    24
science          24
technology       23
miscellaneous    23
Name: category, dtype: int64

In [88]:
inshort_df[inshort_df.category== 'technology']

Unnamed: 0,title,content,category
124,Indians write 35% of 100 mn lines of codes for...,Finance Minister Nirmala Sitharaman said that ...,technology
125,TCS to hire up to 1.5 lakh employees in the ne...,Tata Consultancy Services (TCS) is planning to...,technology
126,Tata could be India's first homegrown iPhone m...,Tata Group is nearing the takeover of a major ...,technology
127,"Who is AC Charania, NASA's new Chief Technolog...",Indian-American AC Charania has been named NAS...,technology
128,First-ever rocket launch from UK ends in failu...,The first-ever rocket launch from the UK ended...,technology
129,Musk's tweet about disabling driver monitoring...,The US NHTSA has questioned Tesla over its CEO...,technology
130,Ex-Coinbase manager's brother gets 10 months p...,The brother of former Coinbase product manager...,technology
131,CCI order puts Indian Android users' privacy a...,Google has reportedly told the Supreme Court i...,technology
132,Tata-run iPhone plant would boost India's elec...,Tata Group succeeding with its bid to take ove...,technology
133,"Instagram to redesign home screen, remove Shop...",Instagram said on Tuesday that it is simplifyi...,technology


In [89]:
inshort_df['technology']= inshort_df.category.map({'technology':'yes' })

In [90]:
inshort_df.technology.fillna('no', inplace=True)

In [91]:
inshort_df

Unnamed: 0,title,content,category,technology
0,"Woman, fiance beaten up by bouncers at Gurugra...",A woman and her fiance suffered injuries after...,national,no
1,"Indian-origin man gets 13-year jail, 10 stroke...",An Indian-origin man has been sentenced to 13 ...,national,no
2,India successfully carries out training launch...,A successful training launch of a short-range ...,national,no
3,"NOCs of over 2,000 CBSE schools in Maha to be ...",Maharashtra Education Commissioner Suraj Mandh...,national,no
4,"Cong is embarrassed by what Rahul says, he's i...",BJP leader and Haryana Chief Minister Manohar ...,national,no
...,...,...,...,...
288,Stellantis may shut more auto plants due to hi...,Stellantis NV CEO Carlos Tavares said on Thurs...,automobile,no
289,Tesla owners in China demand refund after sudd...,Around 200 recent buyers of Tesla's Model Y an...,automobile,no
290,Rolls-Royce reports record car sales in 118 years,Luxury car maker Rolls-Royce has sold 6021 car...,automobile,no
291,Tata Motors completes ₹725-crore acquisition o...,Tata Motors on Tuesday said that the acquisiti...,automobile,no


# Split Data

In [92]:
def split_data(df, target):
    '''
    split_date takes in a dataframe  and target variable and splits into train , validate, test 
    and stratifies on target variable
    
    The split is 20% test 80% train/validate. Then 30% of 80% validate and 70% of 80% train.
    Aproximately (train 56%, validate 24%, test 20%)
    
    returns train, validate, and test 
    '''
    # split test data from train/validate
    train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df[target])

    # split train from validate
    train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate[target])

                                   
    return train, validate, test

In [93]:
train, validate, test = split_data(inshort_df,'technology')

In [95]:
tech_article = train[train.technology=='yes']
non_tech_articles = train[train.technology == 'no']

# model

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

# same basic process as any sklearn transformation:
# make the thing
cv = CountVectorizer()
# use the thing
bag_of_words = cv.fit_transform(train.content)

In [97]:
bag_of_words

<163x2956 sparse matrix of type '<class 'numpy.int64'>'
	with 7440 stored elements in Compressed Sparse Row format>

In [98]:
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [99]:
cv.get_feature_names()



['00',
 '000',
 '01',
 '058',
 '06',
 '094',
 '10',
 '100',
 '104',
 '108',
 '11',
 '113',
 '118',
 '12',
 '122',
 '123',
 '12th',
 '13',
 '131',
 '14',
 '149',
 '15',
 '150',
 '155',
 '156',
 '16',
 '163',
 '164',
 '16th',
 '17',
 '179',
 '18',
 '19',
 '1903',
 '1984',
 '20',
 '2002',
 '2007',
 '2008',
 '2009',
 '2014',
 '2016',
 '2017',
 '2018',
 '2019',
 '202',
 '2020',
 '2021',
 '2022',
 '2023',
 '2024',
 '2025',
 '2026',
 '2030',
 '2032',
 '2034',
 '2040',
 '21',
 '210',
 '21st',
 '22',
 '220',
 '224',
 '225',
 '23',
 '230',
 '232',
 '24',
 '240',
 '25',
 '26',
 '283',
 '29',
 '300',
 '31',
 '32',
 '323',
 '329',
 '33',
 '34',
 '35',
 '352',
 '354',
 '38',
 '384',
 '39',
 '40',
 '41',
 '417',
 '42',
 '442',
 '45',
 '452',
 '47',
 '50',
 '500',
 '52',
 '529',
 '53',
 '54',
 '55',
 '58',
 '60',
 '600',
 '6021',
 '604',
 '65',
 '66',
 '67',
 '70',
 '71',
 '72',
 '725',
 '73rd',
 '747',
 '75',
 '76',
 '771',
 '800',
 '80s',
 '82',
 '83',
 '85',
 '86',
 '87',
 '88',
 '900',
 '950',
 '9

In [100]:
cv.vocabulary_

{'video': 2818,
 'has': 1266,
 'gone': 1208,
 'viral': 2826,
 'showing': 2415,
 'an': 259,
 'autorickshaw': 354,
 'moving': 1763,
 'in': 1365,
 'circles': 626,
 'without': 2907,
 'driver': 914,
 'maharashtra': 1639,
 'ratnagiri': 2161,
 'the': 2656,
 'several': 2380,
 'people': 1970,
 'can': 543,
 'be': 402,
 'seen': 2358,
 'trying': 2731,
 'to': 2685,
 'stop': 2535,
 'it': 1448,
 'as': 309,
 'per': 1971,
 'reports': 2227,
 'steering': 2521,
 'of': 1859,
 'got': 1212,
 'locked': 1608,
 'after': 204,
 'fell': 1069,
 'started': 2508,
 'running': 2290,
 'around': 299,
 'circular': 627,
 'motion': 1756,
 'before': 414,
 'being': 419,
 'stopped': 2536,
 'by': 526,
 'onlookers': 1882,
 'countries': 743,
 'have': 1269,
 'expressed': 1027,
 'intent': 1410,
 'adopt': 188,
 'homegrown': 1316,
 'unified': 2765,
 'payments': 1967,
 'interface': 1412,
 'upi': 2781,
 'national': 1792,
 'corporation': 736,
 'india': 1379,
 'npci': 1842,
 'md': 1688,
 'and': 263,
 'ceo': 589,
 'dilip': 867,
 'asbe': 3

In [101]:
bow = pd.DataFrame(bag_of_words.todense())
bow.columns = cv.get_feature_names()

In [102]:
bow

Unnamed: 0,00,000,01,058,06,094,10,100,104,108,...,younger,your,yourself,yue,zealand,zero,zerodha,zhu,zuckerberg,ötvös
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,00,000,01,058,06,094,10,100,104,108,...,younger,your,yourself,yue,zealand,zero,zerodha,zhu,zuckerberg,ötvös
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,0.0,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# TF-IDF Term Frequency Inverse Document Frequency

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(train.content)

pd.DataFrame(bag_of_words.todense(), 
             columns=tfidf.get_feature_names())



Unnamed: 0,00,000,01,058,06,094,10,100,104,108,...,younger,your,yourself,yue,zealand,zero,zerodha,zhu,zuckerberg,ötvös
0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,0.0,0.0,0.0,0.0,0.0,0.0,0.10911,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# zip: put these two things of the same length together
# dict: turn those two associated things into a k: v pair
# pd.Series: turn those keys into indeces, and the values into values
pd.Series(
    dict(
        zip(
            tfidf.get_feature_names(), tfidf.idf_)))



00            5.001254
000           3.701971
01            5.406719
058           5.406719
06            5.406719
                ...   
zero          5.406719
zerodha       5.406719
zhu           5.406719
zuckerberg    5.406719
ötvös         5.406719
Length: 2956, dtype: float64

In [106]:
cv = CountVectorizer(ngram_range=(2, 2))
bag_of_grams = cv.fit_transform(train.content)

In [107]:
pd.DataFrame(bag_of_grams.todense(),
            columns=cv.get_feature_names())



Unnamed: 0,00 000,000 000,000 cabs,000 crore,000 electric,000 hens,000 in,000 kilogrammes,000 one,000 or,...,yue 12,zealand and,zero cancellations,zerodha reported,zerodha total,zhu has,zhu the,zhu will,zuckerberg are,ötvös the
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [109]:
# add additional column with clean text
inshort_df['clean_text'] = inshort_df.content.apply(clean).apply(' '.join)

In [110]:
inshort_df

Unnamed: 0,title,content,category,technology,clean_text
0,"Woman, fiance beaten up by bouncers at Gurugra...",A woman and her fiance suffered injuries after...,national,no,woman fiance suffered injury allegedly beaten ...
1,"Indian-origin man gets 13-year jail, 10 stroke...",An Indian-origin man has been sentenced to 13 ...,national,no,indianorigin man sentenced 13 year jail 10 str...
2,India successfully carries out training launch...,A successful training launch of a short-range ...,national,no,successful training launch shortrange ballisti...
3,"NOCs of over 2,000 CBSE schools in Maha to be ...",Maharashtra Education Commissioner Suraj Mandh...,national,no,maharashtra education commissioner suraj mandh...
4,"Cong is embarrassed by what Rahul says, he's i...",BJP leader and Haryana Chief Minister Manohar ...,national,no,bjp leader haryana chief minister manohar lal ...
...,...,...,...,...,...
288,Stellantis may shut more auto plants due to hi...,Stellantis NV CEO Carlos Tavares said on Thurs...,automobile,no,stellantis nv ceo carlos tavares said thursday...
289,Tesla owners in China demand refund after sudd...,Around 200 recent buyers of Tesla's Model Y an...,automobile,no,around 200 recent buyer tesla model model 3 ga...
290,Rolls-Royce reports record car sales in 118 years,Luxury car maker Rolls-Royce has sold 6021 car...,automobile,no,luxury car maker rollsroyce sold 6021 car 2022...
291,Tata Motors completes ₹725-crore acquisition o...,Tata Motors on Tuesday said that the acquisiti...,automobile,no,tata motor tuesday said acquisition ford india...


In [111]:
X = inshort_df.clean_text
y =inshort_df.technology
X_train, X_test, y_train, y_test = \
train_test_split(X, y, 
                 test_size=0.2, 
                 random_state=1349)

In [112]:
X_train.head()

198    ia officer supriya sahu work additional chief ...
141    droneacharya aerial innovation listed two week...
187    hrithik roshan speaking upcoming aerial action...
222    video gone viral social medium showing guest w...
201    video showing two passenger punching slapping ...
Name: clean_text, dtype: object

In [113]:
y_train.head()

198     no
141    yes
187     no
222     no
201     no
Name: technology, dtype: object

In [114]:
# Whatever transformations we apply to X_train need to be applied to X_test
cv = CountVectorizer()
X_bow = cv.fit_transform(X_train)
tree = DecisionTreeClassifier(max_depth=3)
tree.fit(X_bow, y_train)
tree.score(X_bow, y_train)

0.9444444444444444

In [115]:
# as with any other sklearn transformation, 
# transform only on our validate and/or test, 
# only fit on train
X_test_bow = cv.transform(X_test)
tree.score(X_test_bow, y_test)

0.9152542372881356

In [116]:
pd.Series(
    dict(
    zip(cv.get_feature_names(), 
    tree.feature_importances_))).sort_values().tail()



economy     0.000000
take        0.051451
software    0.193631
insider     0.302183
tc          0.452735
dtype: float64