# Data Preprocessing

[Dataset Link](https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data)

Importing the libraries and datasets



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the train and test CSV files, using the `id` column as the row index.
orig_train_Data = pd.read_csv('/content/train.csv', index_col='id')
orig_test_Data = pd.read_csv('/content/test.csv', index_col='id')

In [None]:
# Preview the first rows of the training frame.
orig_train_Data.head()

In [None]:
# Preview the first rows of the test frame.
orig_test_Data.head()

# Data Exploration (train)

In [None]:
# (rows, columns) of the training frame; `id` is the index, so it is not counted as a column.
orig_train_Data.shape

In [None]:
# Print column dtypes, non-null counts and memory usage.
orig_train_Data.info()

In [None]:
# True when the training frame has at least one object-dtype (non-numeric) column.
# Counting the selected columns is used instead of `Index.any()`, which evaluates
# the truthiness of the column labels themselves rather than their existence.
len(orig_train_Data.select_dtypes(include='object').columns) > 0

In [None]:
# List the columns whose dtype is float64 or int64 (other numeric widths are not selected).
orig_train_Data.select_dtypes(include=['float64','int64']).columns

In [None]:
# Number of float64/int64 columns in the training frame.
orig_train_Data.select_dtypes(include=['float64', 'int64']).shape[1]

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
orig_train_Data.describe()

In [None]:
# Show every column label of the training frame.
orig_train_Data.columns

# Dealing with the missing values

In [None]:
# True if any cell in the training frame is missing (NaN/None).
orig_train_Data.isna().to_numpy().any()

In [None]:
# True if any cell in the test frame is missing (NaN/None).
orig_test_Data.isna().to_numpy().any()

# Correlation Matrix and HeatMap

In [None]:
# Copy of the training frame without the `Hardness` column.
# Despite its name, this frame holds only the remaining (feature) columns.
hardness_Data = orig_train_Data.drop('Hardness', axis=1)

In [None]:
# Preview the feature-only frame (the `Hardness` column has been dropped).
hardness_Data.head()

In [None]:
# Bar chart of each feature's correlation with the `Hardness` column.
hardness_Data.corrwith(orig_train_Data['Hardness']).plot.bar(
    figsize=(20, 10),
    title='Correlation with Hardness',
    rot=45,
    grid=True,
)

In [None]:
# Pairwise correlation matrix over the columns of the training frame
# (includes `Hardness`, since the full frame is used here).
cor_Mat=orig_train_Data.corr()

In [None]:
# Display the correlation matrix as a table.
cor_Mat

In [None]:
# Draw the correlation matrix as an annotated heatmap on a wide figure.
plt.figure(figsize=(20, 10))
sns.heatmap(data=cor_Mat, annot=True)

# Splitting the train and test set

In [None]:
# Feature matrix: every column except the last one, as a NumPy array.
# NOTE(review): assumes the target is the last column of the frame — confirm.
X = orig_train_Data.iloc[:, :-1].to_numpy()

In [None]:
# `ndarray.astype` returns a new array and leaves the original untouched, so the
# result must be assigned back for the cast to take effect.
X = X.astype('float64')
# Echo the converted matrix so the cell still displays it.
X

In [None]:
# Target vector: the last column of the training frame, as a NumPy array.
# NOTE(review): assumes the last column is `Hardness` — confirm.
y = orig_train_Data.iloc[:, -1].to_numpy()

In [None]:
# `ndarray.astype` returns a new array and leaves the original untouched, so the
# result must be assigned back for the cast to take effect.
y = y.astype('float64')
# Echo the converted target so the cell still displays it.
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled data for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shape of the training feature matrix (80% of the rows).
X_train.shape

In [None]:
# Shape of the held-out feature matrix (20% of the rows).
X_test.shape

In [None]:
# Shape of the training target vector.
y_train.shape

In [None]:
# Shape of the held-out target vector.
y_test.shape

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that standardizes each feature (default settings).
sc=StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the held-out data does not leak into the fit.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Display the scaled held-out features.
X_test

# Building the Model

In [None]:
from sklearn.metrics import precision_score,accuracy_score,confusion_matrix,f1_score,recall_score,mean_absolute_error

In [None]:
from sklearn.model_selection import cross_val_score

Random Forest


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline: random forest regressor with default hyperparameters and a fixed seed,
# fitted on the scaled training split.
model_RF = RandomForestRegressor(random_state=0)
model_RF.fit(X=X_train, y=y_train)

In [None]:
# Baseline predictions for the held-out split.
predictions = model_RF.predict(X=X_test)

In [None]:
# Mean absolute error of the baseline model on the held-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print('MAE: ', mae)

### Cross Validation

Random Search to find best Parameters (Random Forest Regressor)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space sampled by RandomizedSearchCV for the random forest regressor.
parameters = {
    # 100, 200, ..., 500 trees.
    'n_estimators': np.arange(100, 600, 100),
    # 'auto' was removed in scikit-learn 1.3; for regressors it meant
    # "use all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    # 'mae' was renamed to 'absolute_error' (old spelling removed in
    # scikit-learn 1.2); this also matches the final model's criterion.
    'criterion': ['absolute_error'],
}

In [None]:
# Display the hyperparameter search space.
parameters

In [None]:
# Randomized hyperparameter search: 5 sampled candidates, 5-fold CV, scored by
# negative MAE (higher is better), using all available cores.
# `random_state` is fixed so the sampled candidates are reproducible, matching
# the seeding used for the split and the models elsewhere in this notebook.
random_search = RandomizedSearchCV(
    estimator=model_RF,
    param_distributions=parameters,
    n_iter=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=0,
)

In [None]:
# Run the randomized search on the scaled training split.
random_search.fit(X_train,y_train)

In [None]:
# Estimator refitted with the best parameter combination found.
random_search.best_estimator_

In [None]:
# Mean cross-validated score of the best candidate; this is negative MAE, so closer to 0 is better.
random_search.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
random_search.best_params_

FINAL MODEL

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Final random forest with hard-coded hyperparameters.
# NOTE(review): these values presumably come from an earlier
# `random_search.best_params_` run — confirm before reuse.
model = RandomForestRegressor(
    criterion='absolute_error',
    max_depth=15,
    # 'auto' was removed in scikit-learn 1.3; for regressors it meant
    # "use all features", which is spelled 1.0.
    max_features=1.0,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=400,
    random_state=0,
)
model.fit(X_train, y_train)

In [None]:
# Final-model predictions for the held-out split.
fin_pred = model.predict(X=X_test)

In [None]:
# Mean absolute error of the final model on the held-out split
# (overwrites the baseline value stored in `mae`).
mae = mean_absolute_error(y_true=y_test, y_pred=fin_pred)
print('MAE: ', mae)

### Cross Validation

In [None]:
# 10-fold cross-validation of the final model on the training split.
# For a regressor the default score is R^2 (coefficient of determination), not
# classification accuracy, so the scoring is made explicit and the output is
# labelled accordingly. The variable name `accuracies` is kept so any later
# cell that references it keeps working.
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10, scoring='r2')
print('Mean R^2 score is {:.2F} %'.format(accuracies.mean() * 100))
print('Standard Deviation is {:.2F} %'.format(accuracies.std() * 100))