# Exercises

## Ex: CountVectorizer

In [None]:
# from sklearn.datasets import load_files
# imdb_train = load_files('data/aclimdb/train')
# X_train,y_train = imdb_train.data[:10000], imdb_train.target[:10000]
# len(X_train)

In [None]:
# imdb_test = load_files('data/aclimdb/test')
# X_test,y_test = imdb_test.data[:2500], imdb_test.target[:2500]
# len(X_test)

In [1]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
X,y = dataset.data[:10000], dataset.target[:10000]

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0, stratify = y)

In [3]:
dataset['target_names']

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
X_train[1]

'\n   I agree with you.\n'

In [14]:
X_test[0]

'\nHumans have "gone somewhat beyond" what, exactly?    In one thread\nyou\'re telling us that natural morality is what animals do to\nsurvive, and in this thread you are claiming that an omniscient\nbeing can "definitely" say what is right and what is wrong.   So\nwhat does this omniscient being use for a criterion?   The long-\nterm survival of the human species, or what?\n\nHow does omniscient map into "definitely" being able to assign\n"right" and "wrong" to actions?\n\n\nWell, your "original premises" have a habit of changing over time,\nso perhaps you\'d like to review it for us, and tell us what the\ndifference is between an omniscient being be able to assign "right"\nand "wrong" to actions, and telling us the result, is. \n\n\nI\'m talking about the morality introduced by you, which was going to\nbe implemented by this omniscient being that can "definitely" assign\n"right" and "wrong" to actions.\n\nYou tell us what type of morality that is.'

In [15]:
y_train[0]

2

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer().fit(X_train)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [17]:
len(cv.get_feature_names())

62113

In [18]:
# cv.get_feature_names()[:50]

In [19]:
# cv.get_feature_names()[50000:50050]

In [20]:
X_train_cv.shape

(6000, 62113)

In [21]:
X_train_cv.shape

(6000, 62113)

In [22]:
X_train_cv[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [23]:
X_train_cv[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
words = X_train_cv[0].toarray()[0].nonzero()
words

(array([], dtype=int64),)

In [25]:
X_train_cv[0].toarray()[0][words]

array([], dtype=int64)

## Ex: Cross Validation

In [33]:
import pandas as pd
credit = pd.read_csv('data/credit.csv')

In [35]:
credit.head()

Unnamed: 0,Creditability,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,...,Duration in Current address,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker
0,1,1,18,4,2,1049,1,2,4,2,...,4,2,21,3,1,1,3,1,1,1
1,1,1,9,4,0,2799,1,3,2,3,...,2,1,36,3,1,2,3,2,1,1
2,1,2,12,2,9,841,2,4,2,2,...,4,1,23,3,1,1,2,1,1,1
3,1,1,12,4,0,2122,1,3,3,3,...,2,1,39,3,1,2,2,2,1,2
4,1,1,12,4,0,2171,1,3,4,3,...,4,2,38,1,2,2,2,1,1,2


In [37]:
y = credit['Creditability']
X = credit.drop(columns = ['Creditability'])

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [39]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
clf.score(X_test,y_test)

0.7575

In [42]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=5)



In [43]:
scores.mean(), scores.std()

(0.7449935180683843, 0.029648094590487534)

## Ex: Pipeline

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([('scaler', MinMaxScaler()),
                 ('clf', LogisticRegression())])
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)



0.7625

## Ex: GridSearchCV

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('scaler', MinMaxScaler()), 
                 ('tree', DecisionTreeClassifier())])

In [62]:
from sklearn.model_selection import GridSearchCV

param_grid = {'tree__max_depth':range(2,8), 
              'tree__min_samples_split':range(2,10)}

clf = GridSearchCV(pipe, param_grid,cv=5)
clf.fit(X_train,y_train)

print(clf.best_params_)    
print(clf.best_score_)

{'tree__max_depth': 3, 'tree__min_samples_split': 2}
0.715


In [63]:
clf.score(X_test,y_test)

0.6975

## Ex: RandomizedSearchCV

In [64]:
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'tree__max_depth':range(2,8), 
              'tree__min_samples_split':range(2,10)}
clf = RandomizedSearchCV(pipe,param_grid,cv=5).fit(X_train,y_train)

print(clf.best_params_)    
print(clf.best_score_)

{'tree__min_samples_split': 8, 'tree__max_depth': 3}
0.715


In [65]:
clf.score(X_test,y_test)

0.6975

## Ex: Ensemble Estimators

In [67]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier

pipe = Pipeline([('scaler', MinMaxScaler()), ('gb', AdaBoostClassifier())])

In [68]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {'gb__n_estimators':range(10,100,10)}
clf = RandomizedSearchCV(pipe,param_grid,cv=5).fit(X_train,y_train)



In [69]:
print(clf.best_params_)    
print(clf.best_score_)

{'gb__n_estimators': 20}
0.7533333333333333


In [70]:
clf.score(X_test,y_test)

0.735