#### __Problem Statement__
The company is trying to decide whether to focus its efforts on the mobile app experience or on the website.

In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score

pd.set_option('future.no_silent_downcasting', True)
pd.options.mode.copy_on_write = "warn"

### __About Dataset__
* `Avatar`: the avatar color chosen by the customer.
* `Avg. Session Length`: the average duration of sessions (in minutes) across the mobile app and the website.
* `Time on App`: the total amount of time (in minutes) that a customer spends using the mobile app.
* `Time on Website`: the total amount of time (in minutes) that a customer spends on the website.
* `Length of Membership`: the duration of membership or loyalty of each customer (in months).
* `Yearly Amount Spent`: the total amount of money each customer spends on the company's products over a year.

In [2]:
# Read the customer records from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./Ecommerce Customers.csv')

In [3]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Email                 500 non-null    object 
 1   Address               500 non-null    object 
 2   Avatar                500 non-null    object 
 3   Avg. Session Length   500 non-null    float64
 4   Time on App           500 non-null    float64
 5   Time on Website       500 non-null    float64
 6   Length of Membership  500 non-null    float64
 7   Yearly Amount Spent   500 non-null    float64
dtypes: float64(5), object(3)
memory usage: 31.4+ KB


In [4]:
# Peek at five randomly drawn rows; the fixed seed keeps the sample reproducible.
df.sample(n=5, random_state=5)

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
241,karenosborne@yahoo.com,"81814 Pratt Squares Suite 460\nNorth Robert, G...",SlateBlue,32.686245,12.638572,36.097221,4.297737,571.471034
448,flevine@gmail.com,5292 Melanie Crescent Apt. 064\nFischerborough...,AliceBlue,32.204655,12.480702,37.680288,3.279466,478.584286
75,langmatthew@hotmail.com,"606 Perez Drives\nMaryside, CO 94387-5877",DimGray,32.049839,12.238057,38.730862,3.120569,478.719357
212,baldwinbryan@estrada-silva.biz,"1470 Kathleen Pass\nSouth Christopherberg, SD ...",MediumSeaGreen,33.304431,12.37849,38.764297,3.843849,536.130897
481,autumn88@mendoza-mills.com,"214 Obrien Lakes Suite 572\nSouth Jeremy, KS 5...",MediumOrchid,32.047815,12.48267,35.536025,3.393903,497.389558


__We can see that users spend more time on the website than on the mobile app__

__Hypothesis: the more time users spend on the website, the more money they spend throughout the year. We still need to test this hypothesis against the data.__

### __Data Cleaning & Wrangling__

In [5]:
# Summary statistics (count / unique / top / freq) for the object-typed columns.
categorical_columns = df.select_dtypes(include='object')
categorical_columns.describe()

Unnamed: 0,Email,Address,Avatar
count,500,500,500
unique,500,500,138
top,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",SlateBlue
freq,1,1,7


In [6]:
# Summary statistics for the numeric columns, one row per column,
# rounded to two decimals for readability.
numeric_summary = df.describe().T
numeric_summary.round(2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg. Session Length,500.0,33.05,0.99,29.53,32.34,33.08,33.71,36.14
Time on App,500.0,12.05,0.99,8.51,11.39,11.98,12.75,15.13
Time on Website,500.0,37.06,1.01,33.91,36.35,37.07,37.72,40.01
Length of Membership,500.0,3.53,1.0,0.27,2.93,3.53,4.13,6.92
Yearly Amount Spent,500.0,499.31,79.31,256.67,445.04,498.89,549.31,765.52


In [7]:
# Normalise the column names: replace spaces with underscores and strip dots
# (e.g. 'Avg. Session Length' -> 'Avg_Session_Length').
# regex=False makes both replacements literal, so '.' can never be interpreted
# as a regex wildcard, whatever the default of the installed pandas version is.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '', regex=False)
)

In [8]:
# Shorten the verbose column names used in the rest of the notebook.
# The result is re-bound instead of using inplace=True: in-place renaming has
# no performance benefit and prevents method chaining.
df = df.rename(columns={
    'Time_on_App': 'App_Usage',
    'Time_on_Website': 'Website_Usage',
    'Length_of_Membership': 'Membership_Length',
    'Yearly_Amount_Spent': 'Yearly_Spent',
})

In [9]:
# Show the first rows to confirm the cleaned / renamed column headers.
df.head()

Unnamed: 0,Email,Address,Avatar,Avg_Session_Length,App_Usage,Website_Usage,Membership_Length,Yearly_Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


### __Correlation Heatmap & Charts__

In [10]:
# Pairwise correlation matrix of the numeric columns only.
corr = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix.
fig = px.imshow(
    corr,
    template='plotly_dark',
    text_auto='0.2f',  # print every coefficient with two decimals
    # NOTE(review): px.imshow documents aspect as 'equal', 'auto' or None;
    # confirm that the numeric value 1 gives the intended layout.
    aspect=1,
    color_continuous_scale='orrd',
    # Bold is requested with Plotly's pseudo-HTML in the title *text*; a font
    # family such as '<b>poppins' is not a real family name and is never matched.
    title='<b>Correlation Between Data</b>',
)

# Font of the coefficient labels drawn inside the cells.
fig.update_traces(
    textfont={
        'size': 16,
        'family': 'consolas',
    }
)

# Title font: a plain family name, with the bold markup living in the title text.
fig.update_layout(
    title={
        'font': {
            'size': 28,
            'family': 'poppins',
        }
    }
)
iplot(fig)

In [30]:
# Scatter-plot matrix of every numeric column against every other one,
# with the points coloured by yearly spending.
fig = px.scatter_matrix(
    df,
    dimensions=df.select_dtypes(include='number').columns,
    height=950,
    width=900,
    color='Yearly_Spent',
    opacity=.70,
    # Bold is requested with Plotly's pseudo-HTML in the title *text*; a font
    # family such as '<b>poppins' is not a real family name and is never matched.
    title='<b>Relationships Between Numerical Data</b>',
    template='plotly_dark',
)

# Title font: a plain family name, with the bold markup living in the title text.
fig.update_layout(
    title={
        'font': {
            'size': 28,
            'family': 'poppins',
        }
    }
)

iplot(fig)

__There is no meaningful correlation between `Yearly_Spent` and `Website_Usage` time__

### __Multiple Linear Regression Equation__

In [38]:
# Select predictors and target by column name instead of by position, so the
# selection does not silently change if columns are added, dropped or reordered.
feature_columns = [
    'Avg_Session_Length',
    'App_Usage',
    'Website_Usage',
    'Membership_Length',
]
target_column = 'Yearly_Spent'

X = df[feature_columns]
# Double brackets keep y a one-column DataFrame (2-D) rather than a Series.
y = df[[target_column]]

In [55]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=38,
)

In [57]:
# Inspect the first rows of the training predictors (the index shows the shuffled row labels).
X_train.head()

Unnamed: 0,Avg_Session_Length,App_Usage,Website_Usage,Membership_Length
482,30.971676,11.731364,36.074551,4.426364
260,35.039283,14.426491,37.374184,3.930615
46,34.564558,13.146551,37.335446,3.876875
334,31.97648,10.757131,36.595868,1.977007
469,31.169507,13.970181,36.673953,1.785174


### __Build the Model__

In [58]:
# Instantiate a scikit-learn LinearRegression estimator with its default settings.
model = linear_model.LinearRegression()

In [61]:
# Fit the regression on the training split only; the test split stays unseen.
model.fit(X_train, y_train)

In [62]:
# Score the fitted model on the data it was trained on and report it as a percentage.
# This is an in-sample figure, not an estimate of performance on unseen data.
train_score = model.score(X_train, y_train)
print(f'Train Score: {train_score * 100:.2f}%')

Train Score: 98.46%


### __Predict the Data__

In [63]:
# NOTE(review): this scores the *training* split again (same value as the
# "Train Score" cell). Under a "Predict" heading the held-out X_test / y_test
# were presumably intended -- confirm before relying on this number.
model.score(X_train, y_train)

0.984604163566385