## Módulos

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# Show floats inside printed NumPy arrays with exactly two decimal places.
np.set_printoptions(formatter=dict(float_kind="{:.2f}".format))

## Datos

In [None]:
# Load the dataset from CSV.
# NOTE(review): the absolute path looks like a Colab-style /content directory;
# adjust it when running in another environment.
df = pd.read_csv("/content/50_Startups.csv")

# Features: every column except the last one (as a NumPy array).
X = df.iloc[:, :-1].values

# Target: the last column. It is selected relative to the end of the frame
# (instead of the previous hard-coded position 4) so it always complements
# the feature slice above, whatever the number of columns.
y = df.iloc[:, -1].values

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# NOTE(review): X_opt is only assigned in a later cell of this notebook, so
# running the notebook top-to-bottom on a fresh kernel raises NameError here.
# Consider moving this cell below the first X_opt assignment or removing it.
X_opt

[[1.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
 [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
 [1.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
 [1.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
 [1.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
 [1.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
 [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
 [1.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
 [1.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
 [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
 [1.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
 [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
 [1.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
 [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
 [1.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
 [1.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
 [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
 [1.0, 0.0, 1.0, 94657.16, 145077.58, 282574.31],
 [1.0, 1.0, 0.0, 91749.16, 114175.79, 294919.57],
 [1.0, 0.0, 1.0, 86419.7, 153514.11, 0.0],
 

## Preprocesamiento

In [None]:
# One-hot encode the column at position 3 (labelled 'State'); all remaining
# columns are passed through untouched. The transformed matrix is cast to float.
# NOTE: X is overwritten, so re-running this cell applies the encoding again
# to the already-transformed matrix.
ct = ColumnTransformer(
    transformers=[('State', OneHotEncoder(categories='auto'), [3])],
    remainder='passthrough',
)
X = ct.fit_transform(X).astype(float)

In [None]:
# Drop the first column of X. Presumably this removes one of the one-hot
# indicator columns to avoid the dummy-variable trap — TODO confirm.
# X is overwritten, so every re-run of this cell drops one more column.
X = X[:,1:]

In [None]:
# Display the current feature matrix.
X

array([[0.00, 1.00, 165349.20, 136897.80, 471784.10],
       [0.00, 0.00, 162597.70, 151377.59, 443898.53],
       [1.00, 0.00, 153441.51, 101145.55, 407934.54],
       [0.00, 1.00, 144372.41, 118671.85, 383199.62],
       [1.00, 0.00, 142107.34, 91391.77, 366168.42],
       [0.00, 1.00, 131876.90, 99814.71, 362861.36],
       [0.00, 0.00, 134615.46, 147198.87, 127716.82],
       [1.00, 0.00, 130298.13, 145530.06, 323876.68],
       [0.00, 1.00, 120542.52, 148718.95, 311613.29],
       [0.00, 0.00, 123334.88, 108679.17, 304981.62],
       [1.00, 0.00, 101913.08, 110594.11, 229160.95],
       [0.00, 0.00, 100671.96, 91790.61, 249744.55],
       [1.00, 0.00, 93863.75, 127320.38, 249839.44],
       [0.00, 0.00, 91992.39, 135495.07, 252664.93],
       [1.00, 0.00, 119943.24, 156547.42, 256512.92],
       [0.00, 1.00, 114523.61, 122616.84, 261776.23],
       [0.00, 0.00, 78013.11, 121597.55, 264346.06],
       [0.00, 1.00, 94657.16, 145077.58, 282574.31],
       [1.00, 0.00, 91749.16, 11417

### Train-test 

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Number of rows in the training and test partitions.
(x_train.shape[0], x_test.shape[0])

(40, 10)

## Ajustar modelo

### Modelo simple

In [None]:
# Ordinary least-squares regression fitted on the training partition.
# The fit call is kept as the last expression so the cell still displays
# the fitted estimator.
regresion = LinearRegression()
regresion.fit(X=x_train, y=y_train)

LinearRegression()

In [None]:
# Predictions of the fitted model for the held-out test rows.
y_pred = regresion.predict(X=x_test)

#### RSS

In [None]:
# Residual sum of squares on the test set: sum of squared differences
# between the observed and the predicted targets.
np.square(y_test - y_pred).sum()

835028640.3250548

### Variables óptimas

In [None]:
# Prepend an intercept column of ones to X for the statsmodels OLS fits below.
# add_constant sizes the column from X itself (the row count was previously
# hard-coded as 50) and, with its default has_constant='skip', returns X
# unchanged when a constant column is already present, so re-running this
# cell does not stack a second intercept column.
X = sm.add_constant(X, prepend=True)

In [None]:
# Significance level; presumably the p-value threshold for the backward
# elimination — it is not referenced by the cells visible here.
sl = 0.05

# Starting candidate set: columns 0 through 5 of X, as a nested Python list.
X_opt = X[:, list(range(6))].tolist()

In [None]:
# Fit OLS of y on the current candidate columns and show the summary table
# (coefficients, standard errors, p-values).
regresion_OLS = sm.OLS(y, X_opt).fit()
regresion_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Mon, 12 Sep 2022",Prob (F-statistic):,1.34e-27
Time:,23:42:08,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


### Eliminación por p_value mayor

In [None]:
# Refit using columns 0, 1, 3, 4 and 5 of X (column 2 is left out) and
# display the summary of the reduced model.
X_opt = np.take(X, [0, 1, 3, 4, 5], axis=1).tolist()
regresion_OLS = sm.OLS(exog=X_opt, endog=y).fit()
regresion_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Mon, 12 Sep 2022",Prob (F-statistic):,8.49e-29
Time:,17:01:41,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [None]:
# Refit using columns 0, 3, 4 and 5 of X (columns 1 and 2 are left out) and
# display the summary of the reduced model.
X_opt = X.take([0, 3, 4, 5], axis=1).tolist()
regresion_OLS = sm.OLS(y, exog=X_opt).fit()
regresion_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Mon, 12 Sep 2022",Prob (F-statistic):,4.53e-30
Time:,17:04:14,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [None]:
# Refit using only columns 0, 3 and 5 of X and display the summary of the
# resulting model.
X_opt = np.take(X, [0, 3, 5], axis=1).tolist()
regresion_OLS = sm.OLS(y, X_opt).fit()
regresion_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Mon, 12 Sep 2022",Prob (F-statistic):,2.1600000000000003e-31
Time:,17:07:24,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0
