In [1]:
import numpy as np
import pandas as pd

In [2]:
#loading data
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [3]:
train.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Cha...,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /N...,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real ...,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Coloni...,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labo...,CLASS-1376


In [4]:
##Description of data

In [5]:
train.Vendor_Code.describe()

count            5566
unique           1253
top       VENDOR-1883
freq              330
Name: Vendor_Code, dtype: object

In [6]:
train.GL_Code.describe()

count           5566
unique             9
top       GL-6050310
freq            1618
Name: GL_Code, dtype: object

In [7]:
train.Inv_Amt.describe()

count    5566.000000
mean       49.980151
std        28.903030
min         0.010000
25%        24.957500
50%        49.645000
75%        75.170000
max        99.990000
Name: Inv_Amt, dtype: float64

In [8]:
train.Item_Description.describe()

count                                                  5566
unique                                                 5558
top       Ground Transportation Travel and Entertainment...
freq                                                      2
Name: Item_Description, dtype: object

In [9]:
train.Product_Category.describe()

count           5566
unique            36
top       CLASS-1758
freq            1498
Name: Product_Category, dtype: object

In [10]:
test.Vendor_Code.describe()

count            2446
unique            798
top       VENDOR-1883
freq              151
Name: Vendor_Code, dtype: object

In [11]:
test.GL_Code.describe()

count           2446
unique             9
top       GL-6050310
freq             694
Name: GL_Code, dtype: object

In [12]:
test.Inv_Amt.describe()

count    2446.000000
mean       49.449321
std        28.593046
min         0.040000
25%        25.395000
50%        49.230000
75%        73.327500
max        99.940000
Name: Inv_Amt, dtype: float64

In [13]:
test.Item_Description.describe()

count                                                  2446
unique                                                 2446
top       Store Management Base Rent Real Estate Lease/R...
freq                                                      1
Name: Item_Description, dtype: object

In [14]:
#Forming copy of dataframes
train_new = train.copy()
test_new = test.copy()

In [15]:
## Vendor Code Slicing

train_new.Vendor_Code = train.Vendor_Code.str[-4:]
test_new.Vendor_Code = test.Vendor_Code.str[-4:]

In [16]:
## GL Code Slicing
train_new.GL_Code = train.GL_Code.str[3:]
test_new.GL_Code = test.GL_Code.str[3:]

In [17]:
#Item Description
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

vectors = vectorizer.fit_transform(train.Item_Description)
vectors2= vectorizer.transform(test.Item_Description)

In [18]:
#converting vectors to array
df1 = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())
df2 = pd.DataFrame(vectors2.toarray(), columns=vectorizer.get_feature_names())

In [19]:
#one hot encoding first 2 columns (as categorical data)
from sklearn.preprocessing import OneHotEncoder

data = train_new.loc[:,'Vendor_Code':'GL_Code']
enc = OneHotEncoder()
enc_data = enc.fit_transform(data)
df3 = pd.DataFrame(enc_data.toarray())

data_test = test_new.loc[:,'Vendor_Code':'GL_Code']
enc_data_test = enc.transform(data_test)
df4 = pd.DataFrame(enc_data_test.toarray())

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [20]:
#forming new dataframes with vectorized columns added (using Item Description)
train_new_new = pd.concat([train_new, df1, df3], axis=1)
train_new_new= train_new_new.drop(['Item_Description', 'Inv_Id', 'Vendor_Code', 'GL_Code'], axis= 1)

test_new_new = pd.concat([test_new, df2, df4], axis=1)
test_new_new= test_new_new.drop(['Item_Description', 'Inv_Id', 'Vendor_Code', 'GL_Code'], axis= 1)

In [21]:
#separating X(features) and Y(data from training data
X = train_new_new.drop('Product_Category', axis= 1)
y = train_new_new['Product_Category']

In [22]:
# #Normalizing data
# from sklearn.preprocessing import Normalizer
# transformer = Normalizer().fit(X)
# X = transformer.transform(X)

In [23]:
# test_new_new_X = transformer.transform(test_new_new)

In [24]:
#splitting data in training and validation set for checking underfitting and overfitting
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
#using MultinomailNB for checking accuracy
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train, y_train)

clf.score(X_train, y_train)

0.8176454813622955

In [26]:
clf.score(X_valid, y_valid)

0.809471965160588

In [27]:
#predicting labels for test data
pred = clf.predict(test_new_new)

In [28]:
#forming submission file
Product_Category = pd.DataFrame(pred, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission= pd.concat([Inv_Id, Product_Category], axis=1)

In [29]:
submission.head()

Unnamed: 0,Inv_Id,Product_Category
0,15003,CLASS-1758
1,15008,CLASS-1522
2,15013,CLASS-1522
3,15019,CLASS-1376
4,15020,CLASS-1758


In [30]:
#saving submission file
submission.to_csv('submission.csv', index= False)

In [45]:
#using RandomForestClassifier for checking accuracy
from sklearn.ensemble import RandomForestClassifier
clf1 = RandomForestClassifier()
clf1.fit(X_train, y_train)
clf1.score(X_train, y_train)



1.0

In [46]:
clf1.score(X_valid, y_valid)

0.9940119760479041

In [47]:
importance1 = clf1.feature_importances_
importance1 = pd.DataFrame(importance1, index=X.columns, 
                          columns=["Importance"])

In [49]:
importance1["Importance"].sort_values(ascending=False).head(10)

corporate    0.053006
travel       0.050929
ground       0.048233
general      0.040155
1255         0.039434
only         0.034280
1256         0.027577
store        0.027558
rent         0.027007
up           0.026945
Name: Importance, dtype: float64

In [50]:
#predicting labels for test data
pred1 = clf1.predict(test_new_new)

In [51]:
#forming submission file
Product_Category1 = pd.DataFrame(pred1, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission1= pd.concat([Inv_Id, Product_Category1], axis=1)

In [52]:
submission1.head()

Unnamed: 0,Inv_Id,Product_Category
0,15003,CLASS-1758
1,15008,CLASS-1522
2,15013,CLASS-1522
3,15019,CLASS-1376
4,15020,CLASS-1758


In [53]:
#saving submission file
submission1.to_csv('submission1.csv', index= False)

In [81]:
from sklearn.svm import SVC
clf2 =  SVC(kernel= 'linear')
clf2.fit(X_train, y_train)
clf2.score(X_train, y_train)

In [None]:
clf2.score(X_valid, y_valid)

In [None]:
#predicting labels for test data
pred2 = clf2.predict(test_new_new)

In [None]:
#forming submission file
Product_Category2 = pd.DataFrame(pred2, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission2= pd.concat([Inv_Id, Product_Category2], axis=1)

In [None]:
#saving submission file
submission2.to_csv('submission2.csv', index= False)

In [54]:
#using adaboost
from sklearn.ensemble import AdaBoostClassifier
clf3 =  AdaBoostClassifier()
clf3.fit(X_train, y_train)
clf3.score(X_train, y_train)

0.8557253955484044

In [55]:
clf3.score(X_valid, y_valid)

0.8508437670114317

In [56]:
importance3 = clf3.feature_importances_
importance3 = pd.DataFrame(importance3, index=X.columns, 
                          columns=["Importance"])

In [91]:
importance3["Importance"].sort_values(ascending=False).head(10)

leasing         0.26
labor           0.16
rent            0.10
lease           0.10
base            0.08
temp            0.06
up              0.02
only            0.02
requirements    0.02
repair          0.02
Name: Importance, dtype: float64

In [59]:
#predicting labels for test data
pred3 = clf3.predict(test_new_new)

In [60]:
#forming submission file
Product_Category3 = pd.DataFrame(pred3, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission3= pd.concat([Inv_Id, Product_Category3], axis=1)

In [61]:
#saving submission file
submission3.to_csv('submission3.csv', index= False)

In [62]:
from sklearn.linear_model import LogisticRegression
clf4 = LogisticRegression()
clf4.fit(X_train, y_train)
clf4.score(X_train, y_train)



0.995172968624296

In [63]:
clf4.score(X_valid, y_valid)

0.9863908546543277

In [65]:
#predicting labels for test data
pred4 = clf4.predict(test_new_new)

In [66]:
#forming submission file
Product_Category4 = pd.DataFrame(pred4, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission4= pd.concat([Inv_Id, Product_Category4], axis=1)

In [67]:
#saving submission file
submission4.to_csv('submission4.csv', index= False)

In [70]:
from sklearn.ensemble import GradientBoostingClassifier
clf5 = GradientBoostingClassifier()
clf5.fit(X_train, y_train)
clf5.score(X_train, y_train)

1.0

In [71]:
clf5.score(X_valid, y_valid)

0.9967338051170387

In [72]:
#predicting labels for test data
pred5 = clf5.predict(test_new_new)

In [73]:
#forming submission file
Product_Category5 = pd.DataFrame(pred5, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission5= pd.concat([Inv_Id, Product_Category5], axis=1)

In [74]:
#saving submission file
submission5.to_csv('submission5.csv', index= False)

In [76]:
from sklearn.neural_network import MLPClassifier

clf6 = MLPClassifier()
clf6.fit(X_train, y_train)
clf6.score(X_train, y_train)

1.0

In [77]:
clf6.score(X_valid, y_valid)

0.9918345127925966

In [78]:
#predicting labels for test data
pred6 = clf6.predict(test_new_new)

In [79]:
#forming submission file
Product_Category6 = pd.DataFrame(pred6, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission6= pd.concat([Inv_Id, Product_Category6], axis=1)

In [80]:
#saving submission file
submission6.to_csv('submission6.csv', index= False)

In [82]:
from xgboost import XGBClassifier
clf7 = MLPClassifier()
clf7.fit(X_train, y_train)
clf7.score(X_train, y_train)

1.0

In [83]:
clf7.score(X_valid, y_valid)

0.9929232444202504

In [87]:
#predicting labels for test data
pred7 = clf7.predict(test_new_new)

In [88]:
#forming submission file
Product_Category7 = pd.DataFrame(pred7, columns = ['Product_Category'])
Inv_Id = test_new.Inv_Id
submission7= pd.concat([Inv_Id, Product_Category7], axis=1)

In [90]:
#saving submission file
submission7.to_csv('submission7.csv', index= False)

In [92]:
from sklearn.neighbors import KNeighborsClassifier
clf8 = KNeighborsClassifier()
clf8.fit(X_train, y_train)
clf8.score(X_train, y_train)

0.8905872888173774

In [93]:
clf8.score(X_valid, y_valid)

0.8356015242242787