**Linear Regression**

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [36]:
mng_df = pd.read_csv('/Users/ana/Desktop/marketing.csv') 

In [37]:
categorical_df = mng_df.select_dtypes(include = 'category')

In [38]:
categoricals_encoded = pd.get_dummies(mng_df, columns=categorical_df, drop_first=True, dtype = int)

In [39]:
numerical_df = mng_df.select_dtypes(include = 'number')

In [40]:
print(numerical_df)

       unnamed:_0  customer_lifetime_value  income  monthly_premium_auto  \
0               0              4809.216960   48029                    61   
1               1              2228.525238       0                    64   
2               2             14947.917300   22139                   100   
3               3             22332.439460   49078                    97   
4               4              9025.067525   23675                   117   
...           ...                      ...     ...                   ...   
10905       10905             15563.369440       0                   253   
10906       10906              5259.444853   61146                    65   
10907       10907             23893.304100   39837                   201   
10908       10908             11971.977650   64195                   158   
10909       10909              6857.519928       0                   101   

       months_since_last_claim  months_since_policy_inception  \
0                     

In [41]:
numerical_df

Unnamed: 0,unnamed:_0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount,month
0,0,4809.216960,48029,61,7.000000,52,0.000000,9,292.800000,2
1,1,2228.525238,0,64,3.000000,26,0.000000,1,744.924331,1
2,2,14947.917300,22139,100,34.000000,31,0.000000,2,480.000000,2
3,3,22332.439460,49078,97,10.000000,3,0.000000,2,484.013411,1
4,4,9025.067525,23675,117,15.149071,31,0.384256,7,707.925645,1
...,...,...,...,...,...,...,...,...,...,...
10905,10905,15563.369440,0,253,15.149071,40,0.384256,7,1214.400000,1
10906,10906,5259.444853,61146,65,7.000000,68,0.000000,6,273.018929,1
10907,10907,23893.304100,39837,201,11.000000,63,0.000000,2,381.306996,2
10908,10908,11971.977650,64195,158,0.000000,27,4.000000,6,618.288849,2



**X-y split (y is the target variable, in this case, "total claim amount")**
****Train-test split.**


In [42]:
target = 'total_claim_amount'
X = numerical_df.drop(target, axis=1)
y = numerical_df[target]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

      unnamed:_0  customer_lifetime_value  income  monthly_premium_auto  \
3105        3105              4665.129599       0                    62   
6032        6032             10288.924950   96337                   127   
157          157              4873.436612   18866                   126   
6964        6964              6944.739992       0                    68   
6349        6349              2472.469209   63860                    62   
...          ...                      ...     ...                   ...   
5734        5734              3810.238281       0                   108   
5191        5191              3815.851163   38651                    98   
5390        5390              7850.590399       0                    69   
860          860              4974.235309       0                    70   
7270        7270             38055.209530       0                   115   

      months_since_last_claim  months_since_policy_inception  \
3105                     26.0      

**Standardize the data (after the data split!).**


In [45]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Apply linear regression.**


In [46]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

**Model Interpretation**

In [47]:
y_pred = model.predict(X_test_scaled)

In [48]:
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 39823.336182275554


In [51]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [   1.57955423   -4.79937798 -100.89896777  188.1771097     1.19168905
   -4.35232155   -1.33391994    0.61759542   -0.63218299]
Intercept: 435.0824136783915


**Prediction**

In [53]:
random_customer = X_test.sample()

In [54]:
print(random_customer)

     unnamed:_0  customer_lifetime_value  income  monthly_premium_auto  \
900         900              13295.60284   68009                   110   

     months_since_last_claim  months_since_policy_inception  \
900                      1.0                              2   

     number_of_open_complaints  number_of_policies  month  
900                        0.0                   2      2  


In [55]:
model.predict(random_customer)



array([-6903299.90173723])