In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

nltk.download('stopwords')

df = pd.read_csv('train_file.csv')
print(df.head())
print(df.info())
print(df['Subjects'].nunique())
print(df['Creator'].nunique())
print(df['MaterialType'].unique())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
   ID UsageClass CheckoutType  CheckoutYear  CheckoutMonth  Checkouts  \
0   1   Physical      Horizon          2005              4          1   
1   2   Physical      Horizon          2005              4          1   
2   3   Physical      Horizon          2005              4          3   
3   4   Physical      Horizon          2005              4          1   
4   5   Physical      Horizon          2005              4          1   

                                               Title               Creator  \
0                                         Tidal wave                   NaN   
1                     London holiday / Richard Peck.  Peck, Richard, 1934-   
2  Cinco de Mayo : celebrating Hispanic pride / C...      Gnojewski, Carol   
3                                          Annapolis                   NaN   
4                                  As a man thinketh

In [31]:
df2 = pd.read_csv('test_file.csv')
print(df2.head())

      ID UsageClass CheckoutType  CheckoutYear  CheckoutMonth  Checkouts  \
0  31654   Physical      Horizon          2005              4          1   
1  31655   Physical      Horizon          2005              4          2   
2  31656   Physical      Horizon          2005              4          2   
3  31657   Physical      Horizon          2005              4          2   
4  31658   Physical      Horizon          2005              4          2   

                                               Title  \
0                           Footprints at the window   
1                    Seven brides for seven brothers   
2  History [sound recording] / Loudon Wainwright ...   
3                 Doing big business on the internet   
4                       Lets learn how to dance shag   

                          Creator  \
0                             NaN   
1                             NaN   
2  Wainwright, Loudon, III, 1946-   
3                             NaN   
4                    

Preprocessing: Filling the missing data 
X = Features
Y = Target Variable

In [76]:
df['Subjects'].fillna("Others", inplace=True)
#print(df.info())
df['Publisher'].fillna("Extras", inplace=True)
#print(df.info())

df['Creator'].fillna("Single", inplace=True)
df['PublicationYear'].fillna("Not_Mentioned", inplace=True)


df['Checkouts'] = df['Checkouts'].astype('object')
print(df.info())

df.loc[df['Publisher'].str.contains('viking'), 'Publisher'] = 'vikings'
df.loc[df['Subjects'].str.contains('viking'), 'Subjects'] = 'vikings'
df.loc[df['Title'].str.contains('viking'), 'Title'] = 'vikings'
df.loc[df['Creator'].str.contains('viking'), 'Creator'] = 'vikings'

X = df.Subjects + df.Publisher + df['PublicationYear']
Y = df['MaterialType']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 12 columns):
ID                 31653 non-null int64
UsageClass         31653 non-null object
CheckoutType       31653 non-null object
CheckoutYear       31653 non-null int64
CheckoutMonth      31653 non-null int64
Checkouts          31653 non-null object
Title              31653 non-null object
Creator            31653 non-null object
Subjects           31653 non-null object
Publisher          31653 non-null object
PublicationYear    31653 non-null object
MaterialType       31653 non-null object
dtypes: int64(3), object(9)
memory usage: 2.9+ MB
None


# Text Preprocessing: Cleaning the data

1. Removing stopwords
2. Punctuations

In [0]:
def clean_text(text):
    text = text.str.strip().str.lower()
    text = text.str.translate(str.maketrans('', '', '():,.?\'"/+-_[]&%$@'))
    return text

In [0]:
X = clean_text(X)

# Convering text data into vectors using CountVectorizer

In [0]:
#vectorizer = TfidfVectorizer(stop_words='english', strip_accents='unicode', token_pattern=r'\w{1,}', analyzer='word', ngram_range=(1,1))
#vectors1 = vectorizer.fit_transform(X)

vectorizer = CountVectorizer(min_df=1)
vectors1 = vectorizer.fit_transform(X)

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(vectors1,Y, test_size = 0.2, random_state = 21)

In [0]:
from sklearn.ensemble import BaggingClassifier

In [0]:
#from sklearn.ensemble import AdaBoostClassifier

In [0]:
clf = MultinomialNB(alpha=.03)

In [84]:

seed = 1075
np.random.seed(seed)
bagging_clf = BaggingClassifier(clf, max_samples=0.4, random_state=seed)
bagging_clf.fit(X_train, Y_train)

BaggingClassifier(base_estimator=MultinomialNB(alpha=0.03, class_prior=None, fit_prior=True),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.4, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=1075, verbose=0, warm_start=False)

In [85]:
Y_pred=bagging_clf.predict(X_test)
print(Y_pred)

['VIDEOCASS' 'BOOK' 'BOOK' ... 'BOOK' 'BOOK' 'BOOK']


In [86]:
from sklearn.metrics import confusion_matrix
print("Score:",bagging_clf.score(X_test,Y_test))
print("Confusion Matrix:\n",confusion_matrix(Y_test,Y_pred))

Score: 0.7730216395514137
Confusion Matrix:
 [[3610   83  166   35  315   72   76   27]
 [   6   11    1    0    1    0    0    0]
 [  59    0    4    1    0    1    0    0]
 [   0    0    0   13    0   14    0    0]
 [ 134    0    0    0   51    7    5    2]
 [  49    0    4    4   37  715    6    0]
 [  96    3   11    1    4   10  338   66]
 [  18    1    2    2    5   10  103  152]]


In [0]:
#boosting = AdaBoostClassifier(base_estimator=clf, n_estimators= 10)   
#boosting.fit(vectors1, Y)

In [95]:
df2['Subjects'].fillna("Others", inplace=True)
#print(df.info())
df2['Publisher'].fillna("Extras", inplace=True)
#print(df.info())

df2['Creator'].fillna("Single", inplace=True)
df2['PublicationYear'].fillna("Not_Mentioned", inplace=True)

df2['Checkouts'] = df2['Checkouts'].astype('object')
print(df.info())

df2.loc[df2['Publisher'].str.contains('viking'), 'Publisher'] = 'vikings'
df2.loc[df2['Subjects'].str.contains('viking'), 'Subjects'] = 'vikings'
df2.loc[df2['Title'].str.contains('viking'), 'Title'] = 'vikings'

X1 = df2.Subjects + df2.Publisher + df2.PublicationYear

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31653 entries, 0 to 31652
Data columns (total 12 columns):
ID                 31653 non-null int64
UsageClass         31653 non-null object
CheckoutType       31653 non-null object
CheckoutYear       31653 non-null int64
CheckoutMonth      31653 non-null int64
Checkouts          31653 non-null object
Title              31653 non-null object
Creator            31653 non-null object
Subjects           31653 non-null object
Publisher          31653 non-null object
PublicationYear    31653 non-null object
MaterialType       31653 non-null object
dtypes: int64(3), object(9)
memory usage: 2.9+ MB
None


In [0]:
X1 = clean_text(X1)

In [0]:
vectors_test1 = vectorizer.transform(X1)

In [98]:
pred1 = bagging_clf.predict(vectors_test1)
print(pred1)

['BOOK' 'VIDEOCASS' 'SOUNDDISC' ... 'BOOK' 'VIDEOCASS' 'SOUNDDISC']


**Saving these predictions to a csv file**

Creating a new dataframe df3 and storing these predictions of MaterialType in it

In [99]:
df3 = pd.DataFrame()
df3['ID'] = df2['ID']
df3['MaterialType'] = pred1

print(df3.head())

df3.to_csv('Submission.csv', index=False)

      ID MaterialType
0  31654         BOOK
1  31655    VIDEOCASS
2  31656    SOUNDDISC
3  31657         BOOK
4  31658    VIDEOCASS
