In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [3]:
df=pd.read_csv('dirty_ecommerce_sales.csv')

In [4]:
df

Unnamed: 0,product_id,marketing_spend,discount_percent,website_traffic,customer_rating,num_reviews,product_category,region,weekly_sales
0,1,1879.430254,19.886650,12000,5.0,1000.0,Beauty,usa,1782.372796
1,2,10035.441129,40.285205,30000,3.0,1000.0,Beauty,usa,50074.691381
2,3,24648.436126,33.049329,40000,4.0,3000.0,Electronics,usa,109873.768722
3,4,946.434728,26.633026,,3.0,1000.0,,,8306.457168
4,5,1923.048498,52.485694,5000,2.0,2000.0,beauty,EU,9822.360658
...,...,...,...,...,...,...,...,...,...
495,496,9509.037175,39.473036,12000,1.0,,Beauty,,41056.087996
496,497,9718.359846,6.161023,5000,2.0,,Electronics,usa,25381.245731
497,498,3379.393046,56.791859,70000,5.0,500.0,fashion,US,-3864.135174
498,499,8412.433587,24.120918,5000,1.0,100.0,fashion,Asia,31676.796798


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_id        500 non-null    int64  
 1   marketing_spend   496 non-null    float64
 2   discount_percent  500 non-null    float64
 3   website_traffic   415 non-null    object 
 4   customer_rating   414 non-null    float64
 5   num_reviews       400 non-null    float64
 6   product_category  447 non-null    object 
 7   region            415 non-null    object 
 8   weekly_sales      500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB


In [6]:
df.isnull().sum()

product_id            0
marketing_spend       4
discount_percent      0
website_traffic      85
customer_rating      86
num_reviews         100
product_category     53
region               85
weekly_sales          0
dtype: int64

In [7]:
df['marketing_spend'].describe()

count      496.000000
mean     11989.067456
std       7305.989827
min        670.327198
25%       5028.934160
50%      11982.185729
75%      18286.745018
max      24767.375979
Name: marketing_spend, dtype: float64

In [8]:
df['marketing_spend'].fillna(df['marketing_spend'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['marketing_spend'].fillna(df['marketing_spend'].mean(), inplace=True)


In [10]:
df.isnull().sum()

product_id            0
marketing_spend       0
discount_percent      0
website_traffic      85
customer_rating      86
num_reviews         100
product_category     53
region               85
weekly_sales          0
dtype: int64

#### 1.converting numerical column with text like data into float like replacing commas if present which makes dataype as object

In [19]:
for col in ['website_traffic','num_reviews']:
    df[col]=df[col].replace('[,]','',regex=True).astype(float)

In [25]:
df['product_category'].value_counts(dropna=False)

product_category
home           79
fashion        70
Electronics    67
Home           66
Beauty         59
beauty         55
NaN            53
FASHION        51
Name: count, dtype: int64

In [26]:
df['region'].value_counts(dropna=False)

region
US      88
usa     87
NaN     85
EU      80
ASIA    80
Asia    80
Name: count, dtype: int64

#### 2.Normalizing category text4

In [27]:
df['product_category']=df['product_category'].str.lower()

In [28]:
df['product_category'].value_counts(dropna=False)

product_category
home           145
fashion        121
beauty         114
electronics     67
NaN             53
Name: count, dtype: int64

In [29]:
df['product_category'].fillna('unknown',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['product_category'].fillna('unknown',inplace=True)


In [30]:
df['region'].value_counts(dropna=False)

region
US      88
usa     87
NaN     85
EU      80
ASIA    80
Asia    80
Name: count, dtype: int64

In [31]:
df['region']=df['region'].str.lower().replace({'us':'us','usa':'us'})

In [32]:
df['region'].value_counts(dropna=False)

region
us      175
asia    160
NaN      85
eu       80
Name: count, dtype: int64

In [33]:
df['region'].fillna('unknown',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['region'].fillna('unknown',inplace=True)


In [35]:
df.isnull().sum()

product_id            0
marketing_spend       0
discount_percent      0
website_traffic      85
customer_rating      86
num_reviews         100
product_category      0
region                0
weekly_sales          0
dtype: int64

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_id        500 non-null    int64  
 1   marketing_spend   500 non-null    float64
 2   discount_percent  500 non-null    float64
 3   website_traffic   415 non-null    float64
 4   customer_rating   414 non-null    float64
 5   num_reviews       400 non-null    float64
 6   product_category  500 non-null    object 
 7   region            500 non-null    object 
 8   weekly_sales      500 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 35.3+ KB


##### Step 3: Split features and target

In [42]:
df.columns

Index(['product_id', 'marketing_spend', 'discount_percent', 'website_traffic',
       'customer_rating', 'num_reviews', 'product_category', 'region',
       'weekly_sales'],
      dtype='object')

In [43]:
X=df.drop(['product_id','weekly_sales'],axis=1)
y=df['weekly_sales']

In [44]:
numeric_features=['marketing_spend', 'discount_percent', 'website_traffic',
       'customer_rating', 'num_reviews']
categorical_features=['product_category', 'region']

In [None]:
# filling missing values from numerica data with median
# filling missing values from categorical data with mode

In [45]:
num_imputer=SimpleImputer(strategy='median')
X[numeric_features]=num_imputer.fit_transform(X[numeric_features])

In [50]:
cat_imputer=SimpleImputer(strategy='most_frequent')
X[categorical_features]=cat_imputer.fit_transform(X[categorical_features])

##### STEP 4: Encode Categorical Featur

In [51]:
encoder=OneHotEncoder(drop='first', sparse_output=False)
encoder_cat=encoder.fit_transform(X[categorical_features])

In [53]:
encoder_df=pd.DataFrame(encoder_cat,columns=encoder.get_feature_names_out(categorical_features),index=X.index)

In [54]:
encoder_df

Unnamed: 0,product_category_electronics,product_category_fashion,product_category_home,product_category_unknown,region_eu,region_unknown,region_us
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
495,0.0,0.0,0.0,0.0,0.0,1.0,0.0
496,1.0,0.0,0.0,0.0,0.0,0.0,1.0
497,0.0,1.0,0.0,0.0,0.0,0.0,1.0
498,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [59]:
X=pd.concat([X.drop(categorical_features, axis=1), encoder_df], axis=1)

In [60]:
X

Unnamed: 0,marketing_spend,discount_percent,website_traffic,customer_rating,num_reviews,product_category_electronics,product_category_fashion,product_category_home,product_category_unknown,region_eu,region_unknown,region_us
0,1879.430254,19.886650,12000.0,5.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,10035.441129,40.285205,30000.0,3.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,24648.436126,33.049329,40000.0,4.0,3000.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,946.434728,26.633026,30000.0,3.0,1000.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1923.048498,52.485694,5000.0,2.0,2000.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,9509.037175,39.473036,12000.0,1.0,1000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
496,9718.359846,6.161023,5000.0,2.0,1000.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
497,3379.393046,56.791859,70000.0,5.0,500.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
498,8412.433587,24.120918,5000.0,1.0,100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


###### Standerdization of numeric columns

In [61]:
scalar=StandardScaler()
X[numeric_features]=scalar.fit_transform(X[numeric_features])

In [63]:
X

Unnamed: 0,marketing_spend,discount_percent,website_traffic,customer_rating,num_reviews,product_category_electronics,product_category_fashion,product_category_home,product_category_unknown,region_eu,region_unknown,region_us
0,-1.390717,-0.968190,-0.883674,1.486547,-0.299218,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.268748,0.107844,-0.029974,-0.044397,-0.299218,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.741468,-0.273852,0.444303,0.721075,1.750220,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.519064,-0.612315,-0.029974,-0.044397,-0.299218,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-1.384717,0.751427,-1.215668,-0.809870,0.725501,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.341162,0.065002,-0.883674,-1.575342,-0.299218,0.0,0.0,0.0,0.0,0.0,1.0,0.0
496,-0.312367,-1.692224,-1.215668,-0.809870,-0.299218,1.0,0.0,0.0,0.0,0.0,0.0,1.0
497,-1.184377,0.978579,1.867136,1.486547,-0.811578,0.0,1.0,0.0,0.0,0.0,0.0,1.0
498,-0.492014,-0.744830,-1.215668,-1.575342,-1.221465,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [69]:
X_train, X_test,y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [70]:
model=LinearRegression()

In [71]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [72]:
y_pred=model.predict(X_test)

#### Evaluation Matrix

In [74]:
mea=mean_absolute_error(y_test,y_pred)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test, y_pred)

In [78]:
print("\nðŸ“Š MODEL PERFORMANCE")
print(f"Mean Absolute Error (mea): {mea:,.2f}")
print(f"Root Mean Squared Error (mse): {mse:,.2f}")
print(f"RÂ² Score: {r2:.3f}")


ðŸ“Š MODEL PERFORMANCE
Mean Absolute Error (mea): 9,665.77
Root Mean Squared Error (mse): 141,821,852.61
RÂ² Score: 0.795


##### interpreting coefficient

In [79]:
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nðŸ’¡ FEATURE IMPORTANCE (Top 10)")
print(importance_df.head(10))



ðŸ’¡ FEATURE IMPORTANCE (Top 10)
                         Feature   Coefficient
0                marketing_spend  22503.052012
5   product_category_electronics   9362.434327
10                region_unknown   1153.526011
1               discount_percent    539.913035
2                website_traffic    304.149647
11                     region_us    216.188127
4                    num_reviews   -272.179381
3                customer_rating   -321.964220
7          product_category_home   -513.906787
6       product_category_fashion  -1114.592394
