In [23]:
## Cassandra Round - 1 ##
## Team Name: PAV_BHU_JEE ##



In [24]:
# Importing essential libraries

import pandas as pd 
import numpy as np 
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [25]:
# Load the competition's training and test CSV files from the Kaggle input folder.
# index_col is not passed, so both frames keep pandas' default integer index.

train_csv_path = "../input/cassandra-udyam-data-ps-2/trainData (1).csv"
test_csv_path = "../input/cassandra-udyam-data-ps-2/testData.csv"

training_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Feature engineering is done on X, a copy, so training_data stays untouched.
X = training_data.copy()

In [26]:
# Parse the three date columns into proper pandas datetime values.
for date_col in ('Created', 'Invoice_Date', 'Due_Date'):
    X[date_col] = pd.to_datetime(X[date_col])

# Split each timestamp into separate calendar components (day of month, month,
# year, and for 'Created' also hour and minute) so that each component's
# relationship with the target can be analysed on its own.
# Each entry is (new column, source datetime column, `.dt` attribute); the list
# order fixes the order in which the new columns are appended to X.
component_specs = [
    ('Created Day', 'Created', 'day'),
    ('Created Month', 'Created', 'month'),
    ('Created Year', 'Created', 'year'),
    ('Created Hour', 'Created', 'hour'),
    ('Created Minutes', 'Created', 'minute'),
    ('Invoice Day', 'Invoice_Date', 'day'),
    ('Invoice Month', 'Invoice_Date', 'month'),
    ('Invoice Year', 'Invoice_Date', 'year'),
    ('Due Day', 'Due_Date', 'day'),
    ('Due Month', 'Due_Date', 'month'),
    ('Due Year', 'Due_Date', 'year'),
]
for new_col, source_col, attr in component_specs:
    X[new_col] = getattr(X[source_col].dt, attr)

# Time spans expressed as (fractional) days: dividing a timedelta column by a
# one-day timedelta64 yields plain floats.
one_day = np.timedelta64(1, 'D')

# Days from record creation to the due date.
X['diff_created'] = (X['Due_Date'] - X['Created']) / one_day

# Days from the invoice date to the due date.
X['diff_due'] = (X['Due_Date'] - X['Invoice_Date']) / one_day

# Optional visual check of either feature, e.g.:
# sns.kdeplot(data=X['diff_due'], shade=True); plt.show()

In [27]:
# Sanity check: list the columns to confirm the derived date and difference features exist.
current_columns = X.columns
print(current_columns)

In [28]:
# Frequency-encode 'Description': each value is replaced by the number of
# training rows that carry the same description.

description_groups = X.groupby(['Description'])
Description_map = description_groups['Description'].count()

# Values that are not in Description_map's index come out of .map() as NaN.
X["Description_encoded"] = X['Description'].map(Description_map)

In [29]:
# Generate a pandas-profiling report, which summarises missing values, distinct values,
# mean, mode, categorical vs. continuous types, correlations between features, etc.

! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
from pandas_profiling import ProfileReport

# Build the profiling report for the engineered frame and render it inline.
report_html_options = {'style': {'full_width': True}}
profile = ProfileReport(
    X,
    title='Pandas Profiling Report',
    html=report_html_options,
)

profile.to_notebook_iframe()

In [30]:
# Split the target variable off from the feature frame.

target_column = 'Number_of_Days_until_Payment'
# pop() returns the column and removes it from X in place.
y = X.pop(target_column)

print(X.columns)

In [31]:
# Getting the mutual information scores between various features and target variable.
# MI gives the degree of dependence between features.

from sklearn.feature_selection import mutual_info_regression

# Build a separate frame for mutual-information scoring: the raw text and
# datetime columns and the frequency-encoded description are left out.
X_copy = X.drop(['Vendor_Name','Description','Created','Invoice_Date', 'Due_Date','Description_encoded'], axis=1)
y_copy = y.copy()

# Label-encode any remaining object (string) columns as integer codes.
for colname in X_copy.select_dtypes("object"):
    X_copy[colname], _ = X_copy[colname].factorize()

# Flag every integer-typed column as discrete (double-check this before using MI!).
# `dtypes == int` would only match the platform's default integer width and
# therefore miss other widths, e.g. the int32 columns that the `.dt` accessor
# returns on pandas 2.x; is_integer_dtype accepts every integer width.
discrete_features = pd.Series(
    [pd.api.types.is_integer_dtype(dtype) for dtype in X_copy.dtypes],
    index=X_copy.columns,
    dtype=bool,
)

def make_mi_scores(X_copy, y_copy, discrete_features, random_state=0):
    """Return mutual-information scores of each feature against the target.

    Parameters
    ----------
    X_copy : DataFrame of numeric features, one column per feature.
    y_copy : target values aligned with the rows of ``X_copy``.
    discrete_features : mask (or indices) telling ``mutual_info_regression``
        which columns to treat as discrete.
    random_state : seed forwarded to ``mutual_info_regression``. The estimator
        adds small random noise to continuous features, so a fixed seed keeps
        the scores identical between runs. Defaults to 0, the seed used for
        the other stochastic steps in this notebook.

    Returns
    -------
    pandas Series named "MI Scores", indexed by feature name and sorted from
    highest to lowest score.
    """
    mi_scores = mutual_info_regression(
        X_copy,
        y_copy,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X_copy.columns)
    return mi_scores.sort_values(ascending=False)

# Score every feature against the target and print the full ranking.
# (The former bare `mi_scores[::3]` expression was removed: its result was
# discarded, so it neither displayed nor changed anything.)
mi_scores = make_mi_scores(X_copy, y_copy, discrete_features)
print(mi_scores)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series indexed by feature name. The bars are drawn
    on the current matplotlib figure in ascending score order, with the
    feature names used as y-axis tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


# Open a figure of fixed size/resolution and draw the MI bar chart on it.
mi_figure_options = dict(dpi=100, figsize=(8, 5))
plt.figure(**mi_figure_options)
plot_mi_scores(mi_scores)

In [32]:
# Drop the redundant columns: the raw text/datetime columns and 'Created Minutes'.

columns_to_drop = ['Description','Created','Invoice_Date', 'Due_Date','Created Minutes']
X.drop(columns=columns_to_drop, inplace=True)

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = split_parts

In [33]:
# Target-encode the 'Vendor_Name' column with an m-estimate encoder.
# The encoder is fitted on the (X_valid, y_valid) split and then applied to
# X_train; X_valid itself is not transformed here.

from category_encoders import MEstimateEncoder

encoded_columns = ["Vendor_Name"]
encoder = MEstimateEncoder(cols=encoded_columns, m=5.0)
encoder.fit(X_valid, y_valid)

X_train = encoder.transform(X_train)

In [34]:
# Standardise the training features column by column (subtract the mean, divide
# by the standard deviation) in preparation for k-means clustering.
# `subracter` and `divider` are kept so the same scaling can be applied to the test set.

subracter = X_train.mean(axis=0)
divider = X_train.std(axis=0)
X_train = X_train.sub(subracter, axis='columns').div(divider, axis='columns')

In [35]:
# K-means clustering on four of the standardised features; the cluster id is
# added as an extra feature. This step was added about 10 minutes before the
# deadline, so its settings were not tuned.

from sklearn.cluster import KMeans

cluster_inputs = ['Vendor_Name', 'Amount','diff_created', 'diff_due']

kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(X_train[cluster_inputs])
X_train["Cluster"] = cluster_labels

In [36]:
# Importing Catboost and MSE(Evaluation metric).

!pip install catboost
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [37]:
# Fit the CatBoost regressor on the training split.
# The validation split is not passed to fit(): it was already used to fit the
# target encoder.

my_model = CatBoostRegressor(
    n_estimators=600,
    learning_rate=0.05,
)
my_model.fit(X_train, y_train, verbose=False)

best_scores = my_model.best_score_
best_iteration = my_model.best_iteration_
print(best_scores)
print("Best iteration :", best_iteration)

In [38]:
# Show every parameter of the fitted model as reported by get_all_params().
model_params = my_model.get_all_params()
print(model_params)

In [39]:
# Preprocess the test data with the same steps that were applied to the training data.

X_test = test_data.copy()

# Parse the three date columns into pandas datetime values.
for date_col in ('Created', 'Invoice_Date', 'Due_Date'):
    X_test[date_col] = pd.to_datetime(X_test[date_col])

# Calendar components, appended in the order listed here.
# Each entry is (new column, source datetime column, `.dt` attribute).
component_specs = [
    ('Created Day', 'Created', 'day'),
    ('Created Month', 'Created', 'month'),
    ('Created Year', 'Created', 'year'),
    ('Created Hour', 'Created', 'hour'),
    ('Created Minutes', 'Created', 'minute'),
    ('Invoice Day', 'Invoice_Date', 'day'),
    ('Invoice Month', 'Invoice_Date', 'month'),
    ('Invoice Year', 'Invoice_Date', 'year'),
    ('Due Day', 'Due_Date', 'day'),
    ('Due Month', 'Due_Date', 'month'),
    ('Due Year', 'Due_Date', 'year'),
]
for new_col, source_col, attr in component_specs:
    X_test[new_col] = getattr(X_test[source_col].dt, attr)

# Date differences expressed in (fractional) days.
one_day = np.timedelta64(1, 'D')
X_test['diff_created'] = (X_test['Due_Date'] - X_test['Created']) / one_day
X_test['diff_due'] = (X_test['Due_Date'] - X_test['Invoice_Date']) / one_day

# Frequency-encode 'Description' with the counts computed on the training data;
# descriptions that are not in Description_map's index become NaN.
X_test["Description_encoded"] = X_test['Description'].map(Description_map)

# Remove the same raw columns that were dropped from the training frame.
columns_to_drop = ['Description','Created', 'Invoice_Date', 'Due_Date','Created Minutes']
X_test.drop(columns=columns_to_drop, inplace=True)

# Reuse the already-fitted target encoder for 'Vendor_Name'.
X_test = encoder.transform(X_test)

# Scale with the mean / standard deviation computed on the training split.
X_test = X_test.sub(subracter, axis='columns').div(divider, axis='columns')

# Assign each test row to one of the clusters learned on the training split.
cluster_inputs = ['Vendor_Name', 'Amount','diff_created', 'diff_due']
X_test['Cluster'] = kmeans.predict(X_test[cluster_inputs])

print(X_test.columns)

In [40]:
# my_model_2 =  CatBoostRegressor(n_estimators=50,learning_rate=0.05)
# my_model_2.fit(X, y,verbose=False)
# print(my_model_2.best_score_)
# print("Best iteration :",my_model_2.best_iteration_)

In [41]:
# Predict on the preprocessed test set and write the submission file.

predictions = my_model.predict(X_test)

# Vendor names come from the untouched raw test frame, paired positionally
# with the predictions.
submission_columns = {
    'Vendor_Name': test_data['Vendor_Name'],
    'Number_of_Days_Until_Payment': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission1.csv', index=False)