In [None]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.stats.diagnostic as diag
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
import sklearn.preprocessing
from scipy.stats import normaltest
from sklearn.linear_model import Ridge
from yellowbrick.regressor import ResidualsPlot
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score

# `np.warnings` is not available in recent NumPy releases, so go through the
# standard-library `warnings` module instead.
import warnings

try:
    # Newer NumPy releases keep the warning class in `numpy.exceptions`.
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
    # Older NumPy releases only expose it on the top-level namespace.
    VisibleDeprecationWarning = np.VisibleDeprecationWarning

# Silence NumPy's VisibleDeprecationWarning for the rest of the notebook.
warnings.filterwarnings('ignore', category=VisibleDeprecationWarning)

In [None]:
# Global Variables
# random.seed() returns None, so the seed value is stored first and the
# generators are seeded as a separate step.
seed = 123
random.seed(seed)
np.random.seed(seed)  # also seed NumPy's legacy global generator
number_clusters = 3

# Análisis Exploratorio

In [None]:
# Load the train/test sets and the variable catalogue, then split the
# catalogue's variable names by their classification.
train_data = pd.read_csv('./data/train.csv', encoding="ISO-8859-1")
test_data = pd.read_csv('./data/test.csv', encoding="ISO-8859-1")
variables = pd.read_csv('./data/variables.txt', encoding="ISO-8859-1")

quant_vars = variables.loc[variables['Clasification'].eq('Cuantitativa'), 'Variable'].tolist()

# The first qualitative entry is skipped.
# NOTE(review): the reason is not visible here (presumably an identifier column) - confirm.
quali_vars = variables.loc[variables['Clasification'].eq('Cualitativa'), 'Variable'].tolist()[1:]

### Analizando las variables numéricas

In [None]:
# Summary statistics for every quantitative variable
train_data.loc[:, quant_vars].describe()

In [None]:
for var in quant_vars:
    # Values of this variable with missing entries removed
    data = train_data[var].dropna(how='all', axis=0)

    # Histogram with a KDE overlay
    sns.displot(data, kde=True)

    # Kurtosis and skewness as a rough indication of normality;
    # the variable name is printed in bold via ANSI escape codes
    kurtosis_value = stats.kurtosis(data)
    skewness_value = stats.skew(data)
    print('\033[1m' + var + '\033[0m' + ': Kurtosis:', kurtosis_value, 'Skewness:', skewness_value, '\n')

### Analizando las variables categóricas

In [None]:
for var in quali_vars:
    # One wide bar chart of category frequencies per qualitative variable
    fig, ax = plt.subplots(figsize=(20, 5))
    train_data[var].value_counts().plot.bar(ax=ax)
    plt.show()

### Analizando la variable de interés

In [None]:
# Skewness, kurtosis and a percentile summary of the target variable
sale_price = train_data['SalePrice']
print('Skewness: %f' % sale_price.skew())
print('Kurtosis: %f' % sale_price.kurt())
print('\n---Describe---')
sale_price.describe(
    percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, 0.9, 0.95]
)

In [None]:
# Normality tests on the target variable.
sale_price = train_data["SalePrice"].dropna()

# stats.shapiro is the Shapiro-Wilk test, so label it as such and report
# its statistic alongside the p-value.
stat, p = stats.shapiro(sale_price)
print('Shapiro-Wilk:\nW=%f\np=%f\n' % (stat, p))

# Lilliefors test from statsmodels' diagnostic module.
ks_statistic, p_value = diag.lilliefors(sale_price)
print('Lilliefors:\nks=%f\np=%f' % (ks_statistic, p_value))

In [None]:
# Distribution of the target variable with a KDE overlay
sns.displot(data=train_data, x='SalePrice', kde=True)

### Correlación

In [None]:
k = 10  # number of variables for heatmap

# Correlate numeric columns only: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, and older versions dropped them silently anyway.
corrmat = train_data.select_dtypes(include='number').corr()

# The k variables most correlated with SalePrice (SalePrice itself included).
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# Reuse the matrix computed above instead of recomputing it with np.corrcoef,
# which would produce NaN entries if any selected column had missing values.
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

### Obteniendo la relación entre las variables más significativas

In [None]:
# Pairwise relationships between the variables selected for the heatmap.
# No `hue` is passed: SalePrice is continuous, so using it as hue would create
# one colour level per distinct price and take SalePrice out of the grid.
sns.pairplot(train_data[cols])
plt.show()

# Analizando data

In [None]:
def categorize(row, low_max=179280, medium_max=326100):
    """Map a record's sale price to a price band.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['SalePrice']`` (e.g. a DataFrame row).
    low_max : number, optional
        Inclusive upper bound of the 'Low' band.
    medium_max : number, optional
        Exclusive upper bound of the 'Medium' band.

    Returns
    -------
    str
        'Low' when ``0 < price <= low_max``, 'Medium' when
        ``low_max < price < medium_max``, and 'Expensive' for every other
        value. "Every other value" also covers prices <= 0 and NaN, because
        both range checks are false for them.
    """
    price = row['SalePrice']
    if 0 < price <= low_max:
        return 'Low'
    if low_max < price < medium_max:
        return 'Medium'
    return 'Expensive'
   
# Label every row with its price band, then show how many rows fall in each
# band, largest first.
train_data['SalesCategories'] = train_data.apply(categorize, axis=1)

category_sizes = train_data.groupby('SalesCategories').size()
print(category_sizes.sort_values(ascending=False))


In [None]:
# fillna returns a new frame, so train_data itself is left untouched.
copied_train_data = train_data.fillna(0)

y = copied_train_data.pop("SalesCategories")  # response variable

# SalesCategories is derived directly from SalePrice, so SalePrice must not be
# used as a predictor. Drop it in case the quantitative variable list contains
# it; this is a no-op otherwise.
feature_cols = [col for col in quant_vars if col != 'SalePrice']
X = copied_train_data[feature_cols]  # quantitative predictors

70% de entrenamiento y 30% de prueba

In [None]:
# 70/30 split. `random_state` ties the split to the notebook-level seed, and
# `stratify` keeps the class proportions of y in both partitions so that every
# price band is represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, stratify=y
)

## Creando el modelo

In [None]:
# Fit a Gaussian Naive Bayes classifier and predict the held-out set.
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)

# Pin the label order explicitly so the matrix is always 3x3 and ordered
# Expensive / Low / Medium, even if a class is absent from this split.
cm = confusion_matrix(y_test, y_pred, labels=['Expensive', 'Low', 'Medium'])

#### Resultados esperados

In [None]:
# Number of test rows predicted in each price band
pred = list(y_pred)
for category in ('Low', 'Medium', 'Expensive'):
    print(category + ':', pred.count(category))


#### Matriz de confusión

In [None]:
cm = list(cm)
# Short names for the matrix rows/columns.
# NOTE(review): assumes the matrix is ordered Expensive, Low, Medium - confirm
# against the confusion_matrix call that produced `cm`.
labels = ['Exp', 'Low', 'Med']

# Size the columns from the data so the table stays aligned whatever the
# number of digits in each count.
label_width = max(len(label) for label in labels)
cell_width = max(len(str(count)) for row in cm for count in row)
cell_width = max(cell_width, label_width) + 2

# Header row, then one row per label.
print(' ' * label_width + ''.join(label.rjust(cell_width) for label in labels))
for label, row in zip(labels, cm):
    print(label.ljust(label_width) + ''.join(str(count).rjust(cell_width) for count in row))

#### Suma por columna de la matriz de confusión

In [None]:
# Column totals of the confusion matrix, one per label
for label, column in zip(labels, zip(*cm)):
    print(label, ':', sum(column))

#### Resultado

In [None]:
# Micro-averaged precision and recall are equal to accuracy for single-label
# multiclass problems, so they would only repeat the first line. Macro
# averaging gives every class equal weight and is reported instead.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1:", f1_score(y_test, y_pred, average='macro'))