# Linear Regression in Python 

In [2]:
# Linear Regression Using Statsmodels.api 
import statsmodels.api as sm
import pandas as pd

# Data Preprocessing

In [3]:
# Load the insurance dataset from a comma-separated file in the working directory
data = pd.read_csv("insurance.csv", sep=",")

In [4]:
# Preview the first five rows of the loaded table
print(data.head(5))

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [11]:
# Prepare the response, the raw predictors, and a working copy of the table
data = pd.DataFrame(data)
y = data.iloc[:, -1]  # last column (charges) is the response variable
predictors = data.iloc[:, :6]  # first six columns, i.e. everything except charges
df = data.copy()  # working copy that will receive the numeric codes

# Label encoding: replace each text column by the integer codes of its categories.
# Keys are the source columns; values name the matching code columns in object_df.
code_columns = {
    "sex": "sex_binary",
    "smoker": "smoker_binary",
    "region": "region_encoded",
}

object_df = data.select_dtypes(include=['object']).copy()
for source in code_columns:
    # Convert to the pandas categorical dtype so that .cat.codes is available
    object_df[source] = object_df[source].astype('category')
for source, target in code_columns.items():
    object_df[target] = object_df[source].cat.codes

# Overwrite the text columns of the working copy with their integer codes
for source, target in code_columns.items():
    df[source] = object_df[target]

print(df.head(n=5))

   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520


# Modeling

In [13]:
# Simple regressions: fit charges against each predictor on its own
predictors = df.iloc[:, :6]  # encoded frame 'df', every column except charges
for i, column_values in predictors.items():
    # sm.OLS does not add an intercept by itself, so prepend a constant column
    X = sm.add_constant(column_values)
    model = sm.OLS(y, X)
    results = model.fit()
    print(f"{i} vs. charges: ", results.summary())

age vs. charges:                              OLS Regression Results                            
Dep. Variable:                charges   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.089
Method:                 Least Squares   F-statistic:                     131.2
Date:                Tue, 17 Nov 2020   Prob (F-statistic):           4.89e-29
Time:                        19:11:13   Log-Likelihood:                -14415.
No. Observations:                1338   AIC:                         2.883e+04
Df Residuals:                    1336   BIC:                         2.884e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3165.8850    937.149  