In [None]:
import cudf
import cupy as cp
import cuml

# **Loading dataset**

In [None]:
trans_train = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
trans_train.head()

In [None]:
cus_train= cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
cus_train.head()

In [None]:
arti_train=cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/articles.csv')
arti_train.head()

# **DATA PREPROCESSING**  
Unnecessary columns are removed from each dataset. To reduce memory consumption, `article_id` is stored as int32 and `customer_id` is stored as int64 (parsed from the last 16 hexadecimal characters of the original string) instead of as a string.

In [None]:
# Replace the string customer id by an int64 parsed from its last 16 hex characters.
trans_train['customer_id'] = (
    trans_train['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
# Narrow the article id to int32 and parse the transaction date.
trans_train['article_id'] = trans_train['article_id'].astype('int32')
trans_train['t_dat'] = cudf.to_datetime(trans_train['t_dat'])

# Only these three columns are kept for the rest of the notebook.
trans_train = trans_train[['t_dat', 'customer_id', 'article_id']]
trans_train_original = trans_train
print(trans_train.shape)
trans_train.head()

In [None]:
cus_train = cus_train[['customer_id','age']]
cus_train['customer_id'] = cus_train['customer_id'].str[-16:].str.hex_to_int().astype('int64')
cus_train.head()

In [None]:
arti_train=arti_train[['article_id','product_type_no','graphical_appearance_no','colour_group_code']]
arti_train['article_id'] = arti_train.article_id.astype('int32')
arti_train.head()

# **Items purchased  within 2 weeks**

In [None]:
pucr = trans_train.groupby('customer_id').t_dat.max().reset_index()
pucr.columns = ['customer_id','max_dat']


In [None]:
pucr

In [None]:
trans_train = trans_train.merge(pucr,on=['customer_id'],how='left')
trans_train['diff_dat'] = (trans_train.max_dat - trans_train.t_dat).dt.days
trans_train = trans_train.loc[trans_train['diff_dat']<=14]

In [None]:
trans_train['diff_dat'].unique()

# **Sorting by purchase count and most recent date**

In [None]:
pucr = trans_train.groupby(['customer_id','article_id'])['t_dat'].agg('count').reset_index() 
pucr.columns = ['customer_id','article_id','count']


In [None]:
pucr

In [None]:
trans_train = trans_train.merge(pucr,on=['customer_id','article_id'],how='left')
trans_train = trans_train.sort_values(['count','t_dat'],ascending=False)


In [None]:
trans_train

In [None]:
trans_train = trans_train.drop_duplicates(['customer_id','article_id'])
trans_train = trans_train.sort_values(['count','t_dat'],ascending=False)
trans_train= trans_train.reset_index(drop=True)


In [None]:
trans_train

In [None]:
trans_train=trans_train.reset_index(drop=False)
trans_train

# **Recommendation according to age of customer**

In [None]:
cust_age=cudf.merge(trans_train, cus_train, on='customer_id')
cust_age.head()

In [None]:
cust_age=cust_age[['index','customer_id','age','article_id']]
cust_age=cust_age.fillna({'age':18})


In [None]:
cust_age

In [None]:
art_sel = cudf.merge(cust_age, arti_train, on='article_id')

In [None]:
art_sel

In [None]:
output1=art_sel[['age','product_type_no','graphical_appearance_no','colour_group_code']]
output1.head()

In [None]:
output2=art_sel[['article_id']]
output2.head()

In [None]:
output1.to_csv('X_train')

In [None]:
output2.to_csv('Y_train')

In [None]:
#Import required libraries 
import keras #library for neural network
import pandas as pd #loading data in table form  
import seaborn as sns #visualisation 
import matplotlib.pyplot as plt #visualisation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import normalize #machine learning algorithm library

In [None]:
X_train = pd.read_csv("./X_train")

In [None]:
Y_train = pd.read_csv("./Y_train")

In [None]:
X_train = X_train[['age','product_type_no','graphical_appearance_no','colour_group_code']]
Y_train = Y_train[["article_id"]]

In [None]:
# `output1` and `output2` are intentionally NOT deleted here.
# The KNN cells further down (`knn.fit(output1[:75000], output2[:75000])`,
# `knn.predict(output1[...])` and `y_actual = output2[...]`) still read both
# frames, so deleting them at this point made a fresh "Restart & Run All"
# stop with a NameError when it reached the KNN section.

In [None]:
#neural netowrkr normal;zing
X_normalized=normalize(X_train,axis=0)

In [None]:
X_normalized

In [None]:
# Create the train and test partitions:
#   80% -- train data
#   20% -- test data
total_length = len(X_normalized)
train_length = int(0.8 * total_length)
test_length = int(0.2 * total_length)

# Positional split (no shuffling): the first `train_length` rows train the
# model, everything after that is the test set.
X_train, X_test = X_normalized[:train_length], X_normalized[train_length:]
y_train, y_test = Y_train.iloc[:train_length], Y_train.iloc[train_length:]

print("Length of train set x:",X_train.shape[0],"y:",y_train.shape[0])
print("Length of test set x:",X_test.shape[0],"y:",y_test.shape[0])

In [None]:
#Neural network module
from keras.models import Sequential 
from keras.layers import Dense,Activation,Dropout 
#from keras.layers.normalization import Batch_Normalization 
from keras.utils import np_utils

In [None]:
# Fully connected regressor: 128-unit input layer, three 256-unit hidden
# layers and a single linear output unit.
NN_model = Sequential(
    [
        # The Input Layer :
        Dense(
            128,
            kernel_initializer='normal',
            input_dim=X_train.shape[1],
            activation='relu',
        ),
        # The Hidden Layers :
        *[
            Dense(256, kernel_initializer='normal', activation='relu')
            for _ in range(3)
        ],
        # The Output Layer :
        Dense(1, kernel_initializer='normal', activation='linear'),
    ]
)

# Compile the network :
NN_model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)
NN_model.summary()

In [None]:
NN_model.fit(X_train, y_train, epochs=1, batch_size=32, validation_split = 0.2)

In [None]:
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

In [None]:
# Creating submission

In [None]:
# submission = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
# sub_trans = trans_train[["customer_id" , "article_id"]]
# submission = submission[["customer_id"]]

In [None]:
# submission['customer_id']

In [None]:
# sub_trans['customer_id'] = sub_trans['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# submission['customer_id'] = submission['customer_id'].str[-16:].str.hex_to_int().astype('int64')

In [None]:
# sub_trans

In [None]:
# cust_age= submission.merge( sub_trans, on=['customer_id']  , how = "left")
# cust_age

In [None]:
# c_age = cudf.merge(cust_age , cus_train , on )

In [None]:
# cus_train = cus_train[['customer_id','age']]
# cus_train['customer_id'] = cus_train['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# cus_train.head()

In [None]:
# c_age=cudf.merge(cust_age, article, on='customer_id')

In [None]:
# trans_train=pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')[:10000]
# trans_train['article_id'] = trans_train.article_id.astype('int32')

# KNN 
Taking k value equal to 13 and using euclidean distance

In [None]:
from cuml.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=13,metric='euclidean')

In [None]:
knn.fit(output1[:75000], output2[:75000])

In [None]:
ans=knn.predict(output1[:50000])

In [None]:
y_actual = output2[:50000]

In [None]:
import numpy as np

# Calculating R2 score

In [None]:
# Inspect the container type of the ground-truth frame and of a single label.
print(type(y_actual))
type(y_actual['article_id'][0])

In [None]:
# Pull the label column out of the frame via `.values`
# (presumably a cupy array for a cudf column -- TODO confirm).
y1=y_actual['article_id'].values

In [None]:
# Cast the labels to float.
y1=y1.astype(float)

In [None]:
# Cast the predictions to float as well.
ans=ans.astype(float)

In [None]:
type(y1)

In [None]:
# Displayed only: the result is not assigned, so y1 itself is unchanged.
y1.flatten()

In [None]:
# Cast the labels back to an integer dtype (cupy's `int_`).
y1=y1.astype(cp.int_)

In [None]:
y1.shape

In [None]:
# Displayed only: the (len(y1), 1) reshape is not stored anywhere.
y1.reshape(len(y1),1)

In [None]:
ans

In [None]:
# Integer copy of the predictions; `ans` itself stays float.
ans1=ans.astype(cp.int_)

In [None]:
# data=np.concatenate((ans.reshape(len(ans), 1), output2[]))
from sklearn.metrics import confusion_matrix

In [None]:
# Sanity checks on the two inputs of the metric below: container type and
# shape of the predictions, then container type of the labels.
type(ans)

In [None]:
ans.shape

In [None]:
type(y1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# score=accuracy_score(ans,y1)

In [None]:
final=cuml.metrics.r2_score( ans, y1 )

In [None]:
final

# Submission File

In [None]:
trans_train.article_id = ' 0' + trans_train.article_id.astype('str')
trans_train

In [None]:
p_trans_train = trans_train[['customer_id','article_id']].to_pandas() 
p_trans_train

In [None]:
purc = p_trans_train.groupby('customer_id').sum().reset_index()
purc.columns = ['customer_id','prediction']
trans_train=cudf.DataFrame(purc)

In [None]:
trans_train

In [None]:
trans_train.rename(columns={'customer_id':'customer_id_edited'},inplace=True)
trans_train

In [None]:
submission = cudf.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
submission = submission[['customer_id']]
submission['customer_id_edited'] = submission['customer_id'].str[-16:].str.hex_to_int().astype('int64')
submission = submission.merge(trans_train, on='customer_id_edited', how='left').fillna('')
del submission['customer_id_edited']
submission


In [None]:
submission.prediction = submission.prediction.str.strip()
submission.prediction = submission.prediction.str[:131]
submission.to_csv('submission.csv',index=False)
submission.head()