In [122]:
# import the libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [123]:
raw_train = pd.read_csv('train.csv')
raw_test = pd.read_csv('test.csv')
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [124]:
raw_train.shape

(614, 13)

In [125]:
raw_train.nunique() # Pega quantos tipos de atributos tem

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [126]:
train_df = raw_train.copy()
test_df = raw_test.copy()

In [127]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [128]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            362 non-null    object 
 1   Gender             351 non-null    object 
 2   Married            362 non-null    object 
 3   Dependents         353 non-null    object 
 4   Education          362 non-null    object 
 5   Self_Employed      339 non-null    object 
 6   ApplicantIncome    362 non-null    int64  
 7   CoapplicantIncome  362 non-null    int64  
 8   LoanAmount         362 non-null    int64  
 9   Loan_Amount_Term   356 non-null    float64
 10  Credit_History     333 non-null    float64
 11  Property_Area      362 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 34.1+ KB


In [129]:
train_y = train_df['Loan_Status'].copy()

In [130]:
train_df.drop(columns = ["Loan_Status"], inplace = True)

In [131]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [132]:
# Dropping the uncessary columns
train_df.drop(columns = 'Loan_ID', inplace = True)
test_df.drop(columns = 'Loan_ID', inplace = True)

In [133]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [134]:
#Duplicates
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [135]:
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
192,Male,No,0,Graduate,Yes,5833,0,116,360.0,1.0,Urban


In [136]:
test_df.drop_duplicates(inplace = True)
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [137]:
# Missing values analysis
train_df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

In [138]:
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [139]:
#Numeric --> Mean
# Categorical --> mode

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term']
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

In [140]:
cat_imputer = SimpleImputer(strategy = "most_frequent")
cat_imputer.fit(train_df[cat_cols])

train_df[cat_cols] = cat_imputer.transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [141]:
num_imputer = SimpleImputer(strategy = 'mean')
num_imputer.fit(train_df[num_cols])

train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [142]:
train_df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [143]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [144]:
train_df["ApplicantIncome"] = train_df["ApplicantIncome"] +train_df["CoapplicantIncome"]
test_df["ApplicantIncome"] = test_df["ApplicantIncome"] + test_df["CoapplicantIncome"]
#Drop the co-applicant income
train_df.drop( columns = 'CoapplicantIncome', inplace = True)
test_df.drop( columns = 'CoapplicantIncome', inplace = True)

In [145]:
train_df.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,146.412162,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,6091.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,4941.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,141.0,360.0,1.0,Urban


In [146]:
#Application of Label Encoder
train_df.nunique()

Gender                2
Married               2
Dependents            4
Education             2
Self_Employed         2
ApplicantIncome     554
LoanAmount          204
Loan_Amount_Term     11
Credit_History        2
Property_Area         3
dtype: int64

In [147]:
train_df.Dependents.unique() # Ordinal data -> label encoder


array(['0', '1', '2', '3+'], dtype=object)

In [148]:
train_df.Property_Area.unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [149]:
for col in cat_cols :
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.fit_transform(test_df[col])
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,146.412162,360.0,1,2
1,1,1,1,0,0,6091.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,4941.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


In [150]:
#Log transformation
num_cols.remove('CoapplicantIncome')

#tranformação
train_df[num_cols] = np.log(train_df[num_cols])
test_df[num_cols] = np.log(test_df[num_cols])

In [151]:
#Scaling
minmax = MinMaxScaler()
train_df = minmax.fit_transform(train_df)
test_df = minmax.fit_transform(test_df)

In [152]:
train_df.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [None]:
train_df.columns

In [None]:
# Building a model
from sklearn.model_selection import train_test_split
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(train_df,train_y,test_size = 0.3, random_state =42)

In [None]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train, y_train)

In [None]:
y_pred_test = log.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred_test)
print(f"Accuracy is {acc}")

In [None]:
#Serializaton e disserialization
import joblib
joblib.dump(log, "my_trained_model_v1.pkl")

In [None]:
#Dissearilization
final_model = joblib.load("my_trained_model_v1.pkl")

In [None]:
final_model.intercept_, final_model.coef_

## Create custom data transformers

In [153]:
# Key thing --> Inherit - BaseEstimator, TransfomerMinMax
# implement fit and transform
# accept input with __init__ method

from sklearn.base import BaseEstimator, TransformerMixin

class DemoTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X,y = None):
        return self

    def transform(self,X):
        return X

In [175]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None):
        self.variables = variables

    def fit(self, X, y=None):
        # Dicionário para armazenar as médias das colunas
        self.mean_dict = {}
        for col in self.variables:
            self.mean_dict[col] = X[col].mean()
        return self
        
    def transform(self, X):
        # Copiar o DataFrame para não modificar o original
        X = X.copy()
        for col in self.variables:
            X[col].fillna(self.mean_dict[col], inplace=True)
        return X


In [176]:

np.random.seed(42)

# Corrigido para gerar números inteiros aleatórios de 0 a 99 com tamanho 10x2
df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), columns=['A', 'B'])

# Inserir valores NaN em posições específicas
df.iloc[1, 0] = np.nan
df.iloc[2, 1] = np.nan
df.iloc[3, 1] = np.nan
df.iloc[4, 0] = np.nan

# Exibir o DataFrame resultante
print(df)


      A     B
0  51.0  92.0
1   NaN  71.0
2  60.0   NaN
3  82.0   NaN
4   NaN  74.0
5  87.0  99.0
6  23.0   2.0
7  21.0  52.0
8   1.0  87.0
9  29.0  37.0


In [177]:
mean_imputer = MeanImputer(variables = ['A', 'B'])

In [178]:
mean_imputer.fit(df)

In [179]:
mean_imputer.mean_dict

{'A': 44.25, 'B': 64.25}

In [180]:
df.mean()

A    44.25
B    64.25
dtype: float64

In [182]:
#Preenchendo os dados faltantes
mean_imputer.transform(df)

Unnamed: 0,A,B
0,51.0,92.0
1,44.25,71.0
2,60.0,64.25
3,82.0,64.25
4,44.25,74.0
5,87.0,99.0
6,23.0,2.0
7,21.0,52.0
8,1.0,87.0
9,29.0,37.0
