In [1]:
import os
import numpy as np
import pandas as pd
import gc
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# IPython shell escape: list the contents of the competition input directory.
!ls ../input/h-and-m-personalized-fashion-recommendations

**Articles**

In [2]:
# Load the three competition tables.
# `articles` and `customers` have to be loaded as well: later cells call
# articles.head(), customers.head(), .count() on both and plot customers['age'],
# so leaving these reads commented out makes the notebook fail with NameError
# on a fresh kernel (Restart & Run All).
articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv')
customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv')
transactions_train = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

In [None]:
# Preview the first rows of the `articles` frame.
articles.head()

In [None]:
# Preview the first rows of the `customers` frame.
customers.head()

In [None]:
# Preview the first rows of the `transactions_train` frame.
transactions_train.head()

In [None]:
# Mean customer age, followed by the per-column non-null counts of each table.
mean_customer_age = customers["age"].mean()
print(mean_customer_age, '\n')
for table in (customers, articles):
    print(table.count(), '\n')
print(transactions_train.count())

In [None]:
# Spread, maximum and mean of the transaction price column.
price_frame = transactions_train[["price"]]
print(price_frame.std(), '\n')
print(price_frame.max(), '\n')
print(price_frame.mean())

In [None]:
# Distribution of transaction prices (70 bins).
price_ax = transactions_train[['price']].plot.hist(
    alpha=0.7,
    bins=70,
    title='Histogram Of Transaction Prices',
    rot=45,
    grid=True,
    figsize=(12,8),
    fontsize=15,
)
price_ax.set_xlabel('Price')
price_ax.set_ylabel("Number Of Transactions");

In [None]:
# Distribution of customer ages (84 bins).
age_ax = customers[['age']].plot.hist(
    alpha=0.7,
    bins=84,
    title='Histogram Of Customer Ages',
    rot=45,
    grid=True,
    figsize=(12,8),
    fontsize=15,
)
age_ax.set_xlabel('Age')
age_ax.set_ylabel("Number Of Customers");

In [None]:
# Look at the first transaction rows again before deriving features from `t_dat`.
transactions_train.head()

In [3]:
def label_race(row):
    """Return the season name for the date stored in ``row['t_dat']``.

    The season is derived from the month-day part of an ISO ``YYYY-MM-DD``
    date, so any year is supported instead of only the hard-coded
    2018-2021 windows. Boundaries are unchanged:

    * Spring: after 03-20, up to and including 06-21
    * Summer: after 06-21, up to and including 09-22
    * Autumn: after 09-22, up to and including 12-21
    * Winter: everything else (after 12-21 or up to and including 03-20)

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'t_dat'`` (a DataFrame row, a dict, ...).
        The value is converted with ``str()`` and must start with
        ``YYYY-MM-DD``.

    Returns
    -------
    str or None
        ``'Spring'``, ``'Summer'``, ``'Autumn'`` or ``'Winter'``;
        ``None`` when the value does not look like an ISO date.
    """
    text = str(row['t_dat'])
    # Reject anything that is not shaped like YYYY-MM-DD rather than
    # silently mapping it to a season.
    if len(text) < 10 or text[4] != '-' or text[7] != '-':
        return None
    month_day = text[5:10]
    if not (text[:4].isdigit() and month_day[:2].isdigit() and month_day[3:].isdigit()):
        return None

    # Zero-padded 'MM-DD' strings sort chronologically, so plain string
    # comparison is enough here.
    if '03-20' < month_day <= '06-21':
        return 'Spring'
    if '06-21' < month_day <= '09-22':
        return 'Summer'
    if '09-22' < month_day <= '12-21':
        return 'Autumn'
    return 'Winter'

# Add a `season` column. The season depends only on `t_dat`, so label each
# distinct date once and broadcast the result with Series.map instead of
# calling label_race through a row-wise DataFrame.apply(axis=1), which runs
# one Python call per transaction row.
season_by_date = {
    day: label_race({'t_dat': day})
    for day in transactions_train['t_dat'].unique()
}
transactions_train['season'] = transactions_train['t_dat'].map(season_by_date)


In [117]:
# Display the transactions frame, which now carries the `season` column assigned in the previous cell.
transactions_train

In [121]:
# Count of transaction rows for every distinct `t_dat` value.
numTransactionsPerDay = transactions_train.groupby("t_dat").size()

In [122]:
# Print the per-date transaction counts as plain text.
print(numTransactionsPerDay)

In [120]:
# Mean transaction price for every distinct `t_dat` value.
avgPriceByDay = transactions_train.groupby("t_dat")["price"].mean()

In [None]:
# Display the mean price per date computed by the groupby above.
avgPriceByDay

In [None]:
# Check which type the groupby aggregation returned.
type(avgPriceByDay)

In [123]:
# Line chart of the number of transactions on each date.
numTransactionsPerDay.plot(
    kind="line",
    figsize=(16,6),
    title="Transactions per Day",
    grid=True,
    xlabel="Date",
    ylabel="Transactions",
)

In [141]:
from datetime import date, datetime
import calendar

def getDay(x):
    """Return the weekday name (e.g. 'Monday') for a 'YYYY-MM-DD' date string."""
    parsed = datetime.strptime(x, '%Y-%m-%d')
    weekday_index = parsed.weekday()  # 0 = Monday ... 6 = Sunday
    return calendar.day_name[weekday_index]

In [150]:
# Weekday name for every date in the per-day transaction counts.
transaction_dates = numTransactionsPerDay.index.to_series()
transactionFreqKeys = transaction_dates.map(getDay)

In [None]:
# Line chart of the mean transaction price on each date.
avgPriceByDay.plot(
    kind="line",
    figsize=(16,6),
    title="Average Price by Date",
    grid=True,
    xlabel="Date",
    ylabel="Average Price",
)

In [152]:
# One point per date: weekday name on the x axis, that date's transaction count on the y axis.
weekday_fig, weekday_ax = plt.subplots(figsize=(16, 10))
weekday_ax.scatter(transactionFreqKeys, numTransactionsPerDay.to_numpy(), label='Data')

weekday_ax.set_title("Transaction Frequency by Day", fontsize=18)
weekday_ax.set_xlabel("Day of the Week", fontsize=16)
weekday_ax.set_ylabel("Number of Transactions", fontsize=16)

plt.show()

In [None]:
# Keep only the two columns needed for the per-season price histogram.
transactions = transactions_train.filter(items=["price", "season"], axis="columns")

In [None]:
# Stacked histogram of transaction prices, one column per season.
# Group by season a single time and collect each group's price Series,
# instead of rebuilding the groupby for every season. Groups are yielded in
# sorted key order, so the columns come out as Autumn, Spring, Summer, Winter.
prices_by_season = pd.DataFrame(
    {season: prices for season, prices in transactions.groupby('season')['price']}
)
ax = prices_by_season.plot.hist(
    stacked=True,
    figsize=(10,6),
    title="Transaction Prices by Season",
    bins=70,
)
ax.set_xlabel("Transaction Cost")
ax.set_ylabel("Frequency of Cost (in hundred thousands)")

In [None]:
# Display the price/season subset built from `transactions_train`.
transactions

In [90]:
from scipy import optimize
from sklearn.metrics import r2_score

def test_func (x, a, b, c, d):
    return a * np.cos(b*x+c) + d

In [11]:
# Raw arrays for the fit: the index labels of avgPriceByDay and the mean prices.
x_data = avgPriceByDay.index.to_numpy()
y_data = avgPriceByDay.to_numpy()

In [62]:
# Integer positions 0..N-1 standing in for the date labels as the fit's x values.
temp_data = np.arange(len(x_data))

In [102]:
# Fit test_func to the daily mean prices, starting from an initial guess
# given in test_func's parameter order (a, b, c, d).
initial_guess = (2, 0.02, 0, 0.028)
params, params_covariance = optimize.curve_fit(
    test_func, temp_data, y_data, p0=initial_guess
)

In [103]:
# Print both values returned by optimize.curve_fit: the fitted parameters, then their covariance.
print(params, params_covariance)

In [116]:
# Overlay the fitted cosine curve on the observed daily mean prices.
fit_fig, fit_ax = plt.subplots(figsize=(16, 10))
fit_ax.scatter(temp_data, y_data, label='Data')
fitted_curve = test_func(temp_data, *params)
fit_ax.plot(temp_data, fitted_curve, label='Fitted function')

fit_ax.legend(loc='best')
fit_ax.set_title("Modelling transaction prices as a wave equation", fontsize=18)
fit_ax.set_xlabel("Days into test", fontsize=16)
fit_ax.set_ylabel("Transaction price", fontsize=16)

plt.show()

In [107]:
# R^2 of the fitted cosine model against the observed daily mean prices.
# NOTE(review): the variable name keeps its original spelling ("dermination")
# because other cells may reference it.
predicted_prices = test_func(temp_data, *params)
coefficient_of_dermination = r2_score(y_data, predicted_prices)
print(coefficient_of_dermination)