In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the dataset
# NOTE(review): this address looks like the Kaggle dataset landing page rather
# than a direct link to a CSV file -- confirm that read_csv can really parse
# it, or point DATASET_URL at a downloaded copy of the data.
DATASET_URL = "https://www.kaggle.com/datasets/heemalichaudhari/adidas-sales-dataset/data"
df = pd.read_csv(DATASET_URL)

In [None]:
# Preprocessing
# The raw frame has an 'Unnamed: 0' column that is discarded, rows 0-2 that are
# discarded, and the real column labels in the row labelled 3.
# This cell reassigns `df`, so it has to be re-run together with the loading
# cell: a second run on the cleaned frame would fail because 'Unnamed: 0' is
# already gone.
df.info()
df = df.drop('Unnamed: 0', axis=1)
df = df.drop([0, 1, 2], axis=0)
# Promote the header row as a plain list of labels. Assigning the row Series
# itself would carry its label (3) over as the name of the columns index.
df.columns = df.iloc[0].tolist()
df = df.drop(3, axis=0)
df = df.reset_index(drop=True)
# Every column used to contain its own header string, so none of them could be
# stored with a numeric dtype. Convert the measure columns used by the later
# cells so that describe(), the plots and the model see real numbers.
# to_numeric raises on a value it cannot parse instead of silently keeping it.
for numeric_col in ['Price per Unit', 'Units Sold', 'Total Sales', 'Operating Profit']:
    df[numeric_col] = pd.to_numeric(df[numeric_col])


In [None]:
# Data Analysis
# A bare expression is only rendered when it is the last statement of a cell,
# so the summary is printed explicitly to keep it visible ahead of the plot.
print(df.describe())
# Visualization - Total sales per Product
# Sum 'Total Sales' within each product and draw one bar per product.
top_products = df[['Product', 'Total Sales']].groupby('Product').sum()
top_products.plot(kind='bar', figsize=(12, 7), title='Total sales per Product')
plt.show()

In [None]:
# Visualization - Total sales per Retailer
# Sum 'Total Sales' within each retailer and show every retailer's share of
# the total as a labelled pie slice.
top_retailers = df.groupby('Retailer')[['Total Sales']].sum()
top_retailers.plot.pie(
    subplots=True,
    autopct='%1.0f%%',
    figsize=(12, 7),
    title='Total sales per Retailer',
)
plt.show()

In [None]:
# Visualization - Products price distribution
# Histogram of the 'Price per Unit' column split into 25 bins.
unit_prices = df['Price per Unit']
unit_prices.plot.hist(
    bins=25,
    figsize=(12, 7),
    title='Products price distribution',
)
plt.show()

In [None]:
# Visualization - Top Selling Methods
# Sum 'Total Sales' within each sales method and draw one horizontal bar per
# method.
top_salesmt = df.groupby('Sales Method')[['Total Sales']].sum()
top_salesmt.plot.barh(title='Top Selling Methods')
plt.show()

In [None]:
# Machine Learning Part
# Work on a copy so the frame used by the plots above is left untouched, and
# shorten 'Sales Method' to 'Method' on that copy.
df2 = df.copy().rename(columns={'Sales Method': 'Method'})

# Replace each text category with the integer code pd.factorize assigns to it
# (codes follow the order in which the values first appear).
for category_col in ['Region', 'State', 'City', 'Product', 'Retailer', 'Method']:
    df2[category_col] = pd.factorize(df2[category_col])[0]

# These two columns are not used as model inputs.
df2 = df2.drop(columns=['Retailer ID', 'Invoice Date'])

# Cast the measure columns (and the Retailer codes) to integers.
df2 = df2.astype({
    'Units Sold': int,
    'Total Sales': int,
    'Operating Profit': int,
    'Retailer': int,
})

In [None]:
# Finding correlation between Features
# Pairwise correlation of the columns of df2, printed as a table and drawn as
# a heatmap.
corr = df2.corr()
print(corr)
# Reuse the matrix computed above rather than calling df2.corr() a second
# time, and finish with plt.show() like the other plotting cells so the cell
# does not end on the Axes object returned by heatmap.
sns.heatmap(corr)
plt.show()

In [None]:
# Total Sales prediction using multiple linear regression model
# Predictor columns taken from df2; the target is 'Total Sales'.
feature_columns = [
    'Region', 'State', 'City', 'Product', 'Retailer', 'Method',
    'Units Sold', 'Price per Unit', 'Operating Profit',
]
X = df2[feature_columns]
Y = df2['Total Sales']

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

# Training the model, then predicting the held-out rows
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Calculating R squared value
# Score the held-out predictions against the true test targets.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R squared value:", r2)

In [None]:
# Predicting Sales amount
# Predict for every row of X (training and test rows alike) and store the
# result on df2 next to the actual values.
df2['sales_pred'] = lr.predict(X)

# Predicted value on the x axis against the recorded value on the y axis.
ax = plt.gca()
ax.scatter(df2['sales_pred'], df2['Total Sales'])
ax.set_xlabel("sales_pred")
ax.set_ylabel("Total Sales")
ax.set_title('Model correlation scatterplot')
plt.show()