In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the e-commerce CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv("/kaggle/input/e-commerce-dataset/ecommerce_dataset_updated.csv")

In [None]:
# Quick look at ten random rows; the fixed seed keeps this preview identical
# across "Restart & Run All" executions.
print("Sample")
print(df.sample(10, random_state=42))
# Parse the day-month-year strings into datetimes so the `.dt` accessor
# can be used on the column further down.
df['Purchase_Date'] = pd.to_datetime(df['Purchase_Date'], format='%d-%m-%Y')

print("\nBasic Statistics")
print(df.describe())

print("\nData Types")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so appends a stray "None" line).
df.info()
# Derive the calendar month (1-12) of each purchase from the parsed date.
df["Purchase_Month"] = df['Purchase_Date'].dt.month

print("\nNulls")
print(df.isnull().sum())

print("\nDuplicates")
print(df.duplicated().sum())

In [None]:
# List every column whose dtype is not numeric.
non_numeric_cols = list(df.select_dtypes(exclude=[np.number]).columns)
print("Non Numeric Cols:", non_numeric_cols, sep="\n")

# Pairwise correlation restricted to the numeric columns of the frame.
correlation_matrix = df.corr(numeric_only=True)
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")

In [None]:
# Annotated heatmap of the correlation matrix on an explicit 8x4 inch figure.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Bar chart of how many rows fall into each category.
category_counts = df['Category'].value_counts()
ax = category_counts.plot.bar()
ax.set_xlabel("Category")
ax.set_ylabel("Counts of Purchase")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt labels to reduce overlap
ax.set_title("Purchases By Category")
plt.show()

In [None]:
# Bar chart of how many rows use each payment method.
payment_counts = df['Payment_Method'].value_counts()
ax = payment_counts.plot.bar()
ax.set_title("Distribution of Payment Method")
ax.set_xlabel("Payment Method")
ax.set_ylabel("Counts")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt labels to reduce overlap
plt.show()

In [None]:
# One box per category showing the spread of the final price column.
ax = sns.boxplot(x='Category', y='Final_Price(Rs.)', data=df)
ax.set_title("Boxplot of Final Price By Category")
ax.set_xlabel("Category")
ax.set_ylabel("Final Price")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt labels to reduce overlap
plt.show()

In [None]:
# Histogram (10 bins) of the final price with a KDE curve overlaid.
# Title, axis label and plt.show() are added so the figure stands alone like
# the other plots in this notebook and the cell no longer echoes the Axes repr.
sns.histplot(data=df, x='Final_Price(Rs.)', bins=10, kde=True)
plt.title("Distribution of Final Price")
plt.xlabel("Final Price")
plt.show()

In [None]:
# Columns holding category labels that are replaced by integer codes.
cat_cols = ['Category', 'Payment_Method']

# Fit one encoder per column and keep it, keyed by column name, so the
# integer codes can later be mapped back to the original labels.
label_encoder = {col: LabelEncoder().fit(df[col]) for col in cat_cols}

# Overwrite each column in `df` with its encoded values.
for col, le in label_encoder.items():
    df[col] = le.transform(df[col])

In [None]:
# Predictor columns fed to the model, and the column it should predict.
feature_cols = ['Price (Rs.)', 'Discount (%)', 'Category', 'Payment_Method']
x = df[feature_cols]
y = df['Final_Price(Rs.)']

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator itself.
model = LinearRegression().fit(x_train, y_train)
# Trailing expression so the cell still displays the fitted estimator.
model

In [None]:
# Predict the target for the held-out test features with the fitted model.
y_pred = model.predict(x_test)

In [None]:
# Score the test-set predictions: mean squared error and R^2.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

# Each label is printed on its own line, followed by its value.
print("MSE", mse, sep="\n")
print("\nR2", r2, sep="\n")