In [None]:
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.stats.diagnostic as diag
from scipy import stats
from scipy.stats import norm
from scipy.stats import normaltest
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from yellowbrick.regressor import ResidualsPlot

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) 

In [None]:
# Global Variables
# `random.seed()` returns None, so keep the seed value itself in `seed` and apply
# it to both Python's and NumPy's global generators; scikit-learn draws from
# NumPy's global state whenever no explicit `random_state` is supplied.
seed = 123
random.seed(seed)
np.random.seed(seed)
number_clusters = 3

# Análisis Exploratorio

In [None]:
# Load the train/test sets and the variable dictionary, then split the variable
# names by the classification recorded in that dictionary.
csv_encoding = "ISO-8859-1"
train_data = pd.read_csv('./data/train.csv', encoding=csv_encoding)
test_data = pd.read_csv('./data/test.csv', encoding=csv_encoding)
variables = pd.read_csv('./data/variables.txt', encoding=csv_encoding)

is_quantitative = variables['Clasification'] == 'Cuantitativa'
is_qualitative = variables['Clasification'] == 'Cualitativa'
quant_vars = variables.loc[is_quantitative, 'Variable'].tolist()
# The first qualitative entry is skipped, as in the original selection.
quali_vars = variables.loc[is_qualitative, 'Variable'].tolist()[1:]

### Analizando las variables numéricas

In [None]:
# Summary statistics for every quantitative variable.
train_data.loc[:, quant_vars].describe()

In [None]:
# For each quantitative variable: draw its distribution and report the shape
# statistics (kurtosis and skewness) used to judge normality.
for var in quant_vars:
    data = train_data[var].dropna(how='all', axis=0)

    # Histogram with a KDE overlay
    sns.displot(data, kde=True)

    # Shape statistics; the variable name is printed in bold via ANSI escapes
    kurtosis_value = stats.kurtosis(data)
    skewness_value = stats.skew(data)
    heading = '\033[1m' + var + '\033[0m' + ': Kurtosis:'
    print(heading, kurtosis_value, 'Skewness:', skewness_value, '\n')

### Analizando las variables categóricas

In [None]:
# One wide bar chart of category frequencies per qualitative variable.
for var in quali_vars:
    category_counts = train_data[var].value_counts()
    plt.figure(figsize=(20, 5))
    category_counts.plot(kind='bar')
    plt.show()

### Analizando la variable de interés

In [None]:
# Shape statistics of the target, followed by a percentile-rich summary.
sale_price = train_data['SalePrice']
print('Skewness: %f' % sale_price.skew())
print('Kurtosis: %f' % sale_price.kurt())
print('\n---Describe---')
summary_percentiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95]
sale_price.describe(percentiles=summary_percentiles)

In [None]:
# Normality tests on the target. Both functions take a 1-D sample, so pass the
# SalePrice Series rather than a one-column DataFrame.
sale_price_sample = train_data["SalePrice"].dropna()

# `stats.shapiro` is the Shapiro-Wilk test; label its output accordingly
# (it was previously printed under the name "Kolmogorov-Smirnov").
stat, p = stats.shapiro(sale_price_sample)
print('Shapiro-Wilk:\np=%f\n' % p)

# Lilliefors is the Kolmogorov-Smirnov variant with estimated mean and variance.
ks_statistic, p_value = diag.lilliefors(sale_price_sample)
print('Lilliefors:\nks=%f\np=%f' % (ks_statistic, p_value))

In [None]:
# Distribution of the target variable with a KDE overlay.
sns.displot(data=train_data, x='SalePrice', kde=True)

## Modelo de regresión lineal

In [None]:
# Scatter of sale price against living area to spot outlying observations.
fig, ax = plt.subplots(figsize=(20, 5))
ax.scatter(train_data['SalePrice'], train_data['GrLivArea'], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_ylabel("GrLivArea")
ax.set_xlabel("SalePrice")
plt.show()


In [None]:
# Separate the response from the predictors without touching `train_data`.
# No transformation is applied to the target here.
y = train_data["SalePrice"].copy()  # response variable
train = train_data.drop(columns="SalePrice")
X = train  # remaining columns (same object as `train`)

In [None]:
# 70/30 hold-out split. The split is tied to the notebook-level `seed` so it is
# controlled from the Global Variables cell; a `seed` of None keeps
# scikit-learn's default of drawing from NumPy's global random state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, train_size=0.7, random_state=seed
)

In [None]:
# Build (n, 1) column vectors for the response and the single predictor, fit the
# simple linear regression on the training part and predict the held-out part.
s_price = y_train.to_numpy()[:, np.newaxis]
s_price_t = y_test.to_numpy()[:, np.newaxis]
gr_area = X_train[['GrLivArea']].to_numpy()
gr_area_t = X_test[['GrLivArea']].to_numpy()

lm = LinearRegression()
lm.fit(gr_area, s_price)
s_price_pred = lm.predict(gr_area_t)

In [None]:
# Fitted line y = m*x + c: both attributes are array-valued because the model
# was fitted with a 2-D target, so pull out the scalar slope and intercept.
m = lm.coef_[0, 0]
c = lm.intercept_[0]

label = r'$SalePrice = %0.4f*GrLiveArea %+0.4f$' % (m, c)
print(label)

In [None]:
# Held-out observations (points) against the fitted line, with sale price on
# the horizontal axis to match the earlier outlier plot.
fig = plt.figure()
ax = fig.gca()
ax.scatter(s_price_t, gr_area_t)
ax.plot(s_price_pred, gr_area_t, color="blue")
ax.set_xlabel("Sale Price")
ax.set_ylabel("GrLiveArea")
ax.set_title("Test Set Sale Price vs GrLiveArea")

In [None]:
# Hold-out performance of the simple regression.
test_mse = mean_squared_error(s_price_t, s_price_pred)
test_r2 = r2_score(s_price_t, s_price_pred)
print("Mean Squared Error: %.2f" % test_mse)
print("R squared: %.2f" % test_r2)

# Análisis de variables

### Residuales

In [None]:
# Residual = actual price of the held-out 30% of the data minus the price the
# model predicts for it.
residuales = np.subtract(s_price_t, s_price_pred)
residuales.shape[0]


In [None]:
# Residuals against the predictor, drawn as unconnected markers.
plt.plot(gr_area_t, residuales, marker='o', linestyle='None', color='darkblue')
plt.title("Gráfico de Residuales")
plt.xlabel("Variable independiente")
plt.ylabel("Residuales")

In [None]:
# Distribution of the residuals (density histogram with a KDE overlay).
# `sns.distplot` is deprecated; `histplot` is its axes-level replacement, so it
# draws into the wide figure created here. The residuals are flattened because
# they are held as an (n, 1) column vector.
plt.figure(figsize=(20,5))
sns.histplot(residuales.ravel(), kde=True, stat="density")
plt.title("Residuales")

In [None]:
# Box plot of the residuals to expose extreme values.
fig, ax = plt.subplots(figsize=(10, 10))
ax.boxplot(residuales)

In [None]:
# D'Agostino-Pearson normality test on the residuals.
stats.normaltest(residuales)

In [None]:
# Residual plot for GrLivArea -> SalePrice. The estimator API is fit(X, y) and
# score(X, y): the predictor goes first and the response second (they were
# previously swapped, which modelled living area from price). The response is
# flattened to the 1-D shape estimators expect for a single target.
model = Ridge()
visualizer = ResidualsPlot(model)
visualizer.fit(gr_area, s_price.ravel())
visualizer.score(gr_area_t, s_price_t.ravel())

Si el coeficiente de la variable independiente es 0, esta no es un buen predictor de la variable respuesta.  
**H0:** Los coeficientes son iguales a 0  
**H1:** Los coeficientes son distintos de 0

In [None]:
# Ordinary least squares summary for SalePrice ~ GrLivArea.
# statsmodels does not add an intercept on its own; without `add_constant` the
# model is forced through the origin and its coefficient tests and R-squared do
# not correspond to the LinearRegression fit above.
est = sm.OLS(s_price, sm.add_constant(gr_area))
est2 = est.fit()
print(est2.summary())

### Correlación

In [None]:
# Heatmap of the k variables most correlated with SalePrice.
k = 10  # number of variables for heatmap

# Restrict to numeric columns explicitly: recent pandas raises on string columns
# instead of silently dropping them when computing correlations.
corrmat = train_data.select_dtypes(include='number').corr()
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Reuse the matrix already computed instead of recomputing with np.corrcoef;
# pandas handles missing values pairwise, whereas np.corrcoef would propagate
# NaN for any column containing one.
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()