In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import datetime as dt
from sklearn.model_selection import train_test_split

In [2]:
# Show every column and every row when a frame is displayed (None = no limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the cleaned January listings from a path relative to the notebook.
# NOTE(review): the loaded frame contains an 'Unnamed: 0' column (see df.info()
# below) — presumably a saved index from an earlier to_csv; confirm whether it
# should be read with index_col=0 or dropped before modelling.
df = pd.read_csv("../project_5/csv/df_jan_clean.csv")

In [4]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        795 non-null    int64 
 1   urgent            795 non-null    int64 
 2   buy               795 non-null    int64 
 3   change            795 non-null    int64 
 4   sell              795 non-null    int64 
 5   price             795 non-null    int64 
 6   gift              795 non-null    int64 
 7   search            795 non-null    int64 
 8   repair            795 non-null    int64 
 9   parts             795 non-null    int64 
 10  synt_brand        795 non-null    object
 11  description       794 non-null    object
 12  city              795 non-null    object
 13  seen              795 non-null    int64 
 14  published_dt      795 non-null    object
 15  expire_dt         795 non-null    object
 16  date_scrapped_dt  795 non-null    object
dtypes: int64(11), ob

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,urgent,buy,change,sell,price,gift,search,repair,parts,seen
count,795.0,795.0,795.0,795.0,795.0,795.0,795.0,795.0,795.0,795.0,795.0
mean,405.500629,0.001258,0.021384,0.040252,0.943396,624.56478,0.001258,0.006289,0.0,0.001258,609.144654
std,234.108343,0.035466,0.144751,0.196672,0.231229,987.873816,0.035466,0.079105,0.0,0.035466,1323.73943
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
25%,202.5,0.0,0.0,0.0,1.0,120.0,0.0,0.0,0.0,0.0,152.0
50%,404.0,0.0,0.0,0.0,1.0,300.0,0.0,0.0,0.0,0.0,312.0
75%,609.5,0.0,0.0,0.0,1.0,700.0,0.0,0.0,0.0,0.0,621.5
max,809.0,1.0,1.0,1.0,1.0,12000.0,1.0,1.0,0.0,1.0,22551.0


In [6]:
# Discard the free-text `description` column.
df = df.drop(columns=['description'])

In [7]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0          0
urgent              0
buy                 0
change              0
sell                0
price               0
gift                0
search              0
repair              0
parts               0
synt_brand          0
city                0
seen                0
published_dt        0
expire_dt           0
date_scrapped_dt    0
dtype: int64

In [8]:
# Work on an independent copy so `df` stays as loaded.
ndf = df.copy(deep=True)

In [9]:
# Parse the three timestamp columns (object dtype after read_csv) into datetimes.
for date_col in ('published_dt', 'expire_dt', 'date_scrapped_dt'):
    ndf[date_col] = pd.to_datetime(df[date_col])

In [10]:
# Check the dtypes after the datetime conversion.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        795 non-null    int64         
 1   urgent            795 non-null    int64         
 2   buy               795 non-null    int64         
 3   change            795 non-null    int64         
 4   sell              795 non-null    int64         
 5   price             795 non-null    int64         
 6   gift              795 non-null    int64         
 7   search            795 non-null    int64         
 8   repair            795 non-null    int64         
 9   parts             795 non-null    int64         
 10  synt_brand        795 non-null    object        
 11  city              795 non-null    object        
 12  seen              795 non-null    int64         
 13  published_dt      795 non-null    datetime64[ns]
 14  expire_dt         795 non-

Instead of using an exploratory data visualization, we can use a quantifiable measure to detect the presence of outliers.

- **The skewness value**: When the skewness value is between -1 and +1, the distribution is roughly symmetric, which indicates that the variable has no significant outliers to worry about. We obtain the skewness of a variable with `skew()`, as shown below:

In [11]:
# Numeric columns whose skewness is inspected (and later clipped).
asimetric_columns = [
    'urgent', 'buy', 'change', 'sell', 'price',
    'gift', 'search', 'repair', 'parts', 'seen',
]

# Print the sample skewness of each column, one per line.
for col_name in asimetric_columns:
    print(f"{col_name}: ", ndf[col_name].skew())

urgent:  28.19574435974338
buy:  6.6296564811679985
change:  4.687062525386084
sell:  -3.844792060410648
price:  4.590560045250744
gift:  28.195744359743408
search:  12.513872828053973
repair:  0.0
parts:  28.1957443597434
seen:  10.562636123296826


In [12]:
# Distribution of `price` before any outlier treatment.
ndf["price"].describe()

count      795.000000
mean       624.564780
std        987.873816
min          0.000000
25%        120.000000
50%        300.000000
75%        700.000000
max      12000.000000
Name: price, dtype: float64

### Eliminación de valores atípicos: uso del valor IQR

El otro método popular para identificar valores atípicos es utilizar el valor del rango intercuartílico (**IQR**). 

El **IQR** (en español, RIC: rango intercuartílico) es una forma de detectar valores atípicos. Asume que todos los puntos de datos situados a menos de 1,5 veces el rango intercuartílico por debajo del cuartil inferior (Q1) o por encima del cuartil superior (Q3) son "normales", y que cualquier punto fuera de ese intervalo debe considerarse un valor atípico. 

Para calcular el RIC, primero necesitamos encontrar el primer cuartil (Q1) y el tercer cuartil (Q3). A continuación, podemos calcular el RIC restando Q1 de Q3.

In [13]:
# Outlier handling with the 1.5 * IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are clipped to the nearest fence.
# Rows are kept; only the extreme values are capped.

# Q1 and Q3 of every column, in the same order as `asimetric_columns`
q1 = [ndf[col].quantile(q=0.25) for col in asimetric_columns]
q3 = [ndf[col].quantile(q=0.75) for col in asimetric_columns]

# Interquartile range per column
iqr = [third - first for first, third in zip(q1, q3)]

# Lower and upper fences per column
lower = [first - 1.5 * spread for first, spread in zip(q1, iqr)]
upper = [third + 1.5 * spread for third, spread in zip(q3, iqr)]

# Clip each column to its own fences
for col, spread, low, high in zip(asimetric_columns, iqr, lower, upper):
    if spread == 0:
        # Q1 == Q3 makes both fences collapse onto the same number, so clipping
        # would overwrite the whole column with that constant. This is the case
        # for the 0/1 flag columns (their 25% and 75% quantiles are equal in
        # describe()), so they are left untouched.
        continue
    ndf[col] = ndf[col].clip(low, high)


In [14]:
# Distribution of `price` after clipping; compare `max` with the earlier describe().
ndf["price"].describe()

count     795.000000
mean      494.003774
std       492.275113
min         0.000000
25%       120.000000
50%       300.000000
75%       700.000000
max      1570.000000
Name: price, dtype: float64

In [15]:
# Views per unit of price. `price` has a minimum of 0 (see describe() above) and
# dividing by zero would produce inf, so zero prices are masked to NaN first:
# the ratio is then missing, not infinite, for those rows.
ndf["ratio_seen_price"] = ndf["seen"] / ndf["price"].where(ndf["price"] != 0)

In [16]:
# Re-check the schema now that `ratio_seen_price` has been added.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        795 non-null    int64         
 1   urgent            795 non-null    int64         
 2   buy               795 non-null    int64         
 3   change            795 non-null    int64         
 4   sell              795 non-null    int64         
 5   price             795 non-null    int64         
 6   gift              795 non-null    int64         
 7   search            795 non-null    int64         
 8   repair            795 non-null    int64         
 9   parts             795 non-null    int64         
 10  synt_brand        795 non-null    object        
 11  city              795 non-null    object        
 12  seen              795 non-null    float64       
 13  published_dt      795 non-null    datetime64[ns]
 14  expire_dt         795 non-

In [17]:
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [18]:
# Separate frame for modelling so `ndf` keeps every column.
mldf = ndf.copy(deep=True)

In [19]:
# Build the modelling frame from `ndf` without the datetime and flag columns.
# Deriving it from `ndf` (instead of reassigning `mldf = mldf.drop(...)`) lets
# this cell be re-run without a KeyError for columns that were already dropped.
# `drop` returns a new frame, so `ndf` itself is not modified.
mldf = ndf.drop(
    columns=[
        'published_dt', 'expire_dt', 'date_scrapped_dt',
        'urgent', 'buy', 'change', 'sell', 'gift', 'search', 'repair', 'parts',
    ]
)

In [20]:
import numpy as np
import numpy.random as rnd

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(
    mldf,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Number of rows in the training split.
len(train_set)

636

In [24]:
# Number of rows in the test split.
len(test_set)

159

In [None]:
# ---------------------------------------------------------------------------
# Divider cell. The original content was a run of dots, which is not valid
# Python and raises a SyntaxError when the notebook is run top to bottom.
# ---------------------------------------------------------------------------

In [None]:
# Single-column DataFrame (not a Series) holding the brand category.
synt_brand_cat = ndf.loc[:, ["synt_brand"]]

In [None]:
# Peek at the first 10 brand values.
synt_brand_cat.head(10)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the brand column with a default-configured encoder.
# NOTE(review): the encoder is fitted on `synt_brand_cat`, which is taken from
# the full `ndf` and not from `train_set` — confirm this is exploratory only.
cat_encoder = OneHotEncoder()
hispasonic_cat_1hot = cat_encoder.fit_transform(synt_brand_cat)
# Display the encoded result as the cell output.
hispasonic_cat_1hot

In [None]:
# Convert the encoded result to a plain array for inspection.
hispasonic_cat_1hot.toarray()

Lista de categorías, obtenida mediante la variable de instancia `categories_` del codificador

In [None]:
# Categories stored on the fitted encoder.
cat_encoder.categories_

In [None]:
# ---------------------------------------------------------------------------
# Divider cell. The original content was a run of dots, which is not valid
# Python and raises a SyntaxError when the notebook is run top to bottom.
# ---------------------------------------------------------------------------

In [None]:
# Start the modelling frame again from an independent copy of `ndf`.
mldf = ndf.copy(deep=True)

In [None]:
# Modelling frame without the three datetime columns.
# Deriving it from `ndf` (instead of reassigning `mldf = mldf.drop(...)`) lets
# this cell be re-run without a KeyError for columns that were already dropped.
# `drop` returns a new frame, so `ndf` itself is not modified.
mldf = ndf.drop(columns=['published_dt', 'expire_dt', 'date_scrapped_dt'])

In [None]:
# Schema of the modelling frame after removing the datetime columns.
mldf.info()

In [None]:
#ndf['price'] = np.where(ndf['price'] > 900, 900, ndf['price']) # 900 force to be between +1 and -1
#print(ndf['price'].skew())

In [None]:
#ndf = pd.get_dummies(ndf, columns=['price'], drop_first=True, prefix='Price')
#ndf = pd.get_dummies(ndf, columns=['seen'], drop_first=True, prefix='Seen')

In [None]:
#target_variable = ['Price'] 
#predictors = list(set(list(ndf.columns)) - set(target_variable))
#predictors

In [None]:
#ndf[predictors] = ndf[predictors] / ndf[predictors].max()