<a href="https://colab.research.google.com/github/ansiyo/Machine-Learning-Rep/blob/main/PROJECT_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SIMPLEIMPUTER
#
# Numerical imputation: every NaN is replaced by the mean of its own column.

import numpy as np
from sklearn.impute import SimpleImputer

# Toy 3x2 matrix; each column has exactly one missing entry
data = np.array([
    [1, 2],
    [np.nan, 3],
    [7, np.nan],
])

# 'mean' strategy: the fill value for a column is the average of its observed values
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# First learn the per-column means, then use them to fill the gaps
imputer.fit(data)
transformed_data = imputer.transform(data)

print(transformed_data)

[[1.  2. ]
 [4.  3. ]
 [7.  2.5]]


In [None]:
# Imputing Missing Categorical Values

# numpy is imported here (not only in an earlier cell) so this cell runs on its own:
# the sample data below uses np.nan as the missing-value marker.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Sample data with missing categorical values (one gap per column)
data = pd.DataFrame({
    'color': ['red', 'green', np.nan, 'blue', 'green'],
    'size': ['S', np.nan, 'M', 'L', 'L']
})

# 'most_frequent' fills each gap with the most common value of that column;
# unlike 'mean', this strategy also works on string data.
imputer = SimpleImputer(strategy='most_frequent')

# Fit on the data and fill the gaps in one step
transformed_data = imputer.fit_transform(data)

print(transformed_data)

In [None]:
import pandas                              # SLIDE EXAMPLE
import numpy as np
from sklearn.impute import SimpleImputer

# Step 1: Create data with missing values
# Two Series whose indexes only partly overlap:
# 'one' has no values for index 'c' and 'd', while 'two' has no value for index 'e'.
data = {'one': pandas.Series([1, 2, 5], index=['a', 'b', 'e']),
        'two': pandas.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

# Step 2: Convert to DataFrame
# The Series are aligned on the union of their indexes, so positions where a
# Series had no value show up as NaN (missing).
table = pandas.DataFrame(data)
print("Before:\n", table)

# Step 3: Use SimpleImputer to fill missing values with the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(table)
# The fit() method calculates the mean for each column based on the non-missing values.

# Step 4: Transform the data and display the output
# Pass the DataFrame itself rather than table.values: the imputer was fitted on
# named columns, and transforming a bare array makes scikit-learn warn that the
# input has no valid feature names.
imputed_data = imputer.transform(table)
# The transform() method replaces the missing values with the previously calculated means.
# The result is put back into a DataFrame with the original column names AND row
# labels, so "After" lines up row-for-row with "Before".
print("After:\n", pandas.DataFrame(imputed_data, columns=table.columns, index=table.index))

In [None]:
# HANDLING CATEGORICAL VALUES
#
# Builds the small customer-loan table and prints it unchanged.

import pandas as pd

# Raw column values, one entry per customer
customer_ids = [1, 2, 3, 4, 5]
loan_types = ['Home Loan', 'Personal Loan', 'Education Loan', 'Home Loan', 'Credit Loan']
incomes = ['30K', '25K', '15K', '40K', '35K']

# Column name -> Series; Loan_type and Income hold text, Customer_id holds integers
data = {
    'Customer_id': pd.Series(customer_ids),
    'Loan_type': pd.Series(loan_types),
    'Income': pd.Series(incomes),
}

# A DataFrame is a 2-dimensional labeled table whose columns may have different types.
# Expected layout:
#
#   Customer_id | Loan_type      | Income
#   ------------+----------------+-------
#   1           | Home Loan      | 30K
#   2           | Personal Loan  | 25K
#   3           | Education Loan | 15K
#   4           | Home Loan      | 40K
#   5           | Credit Loan    | 35K
loan_info = pd.DataFrame(data=data)
print(loan_info)

In [None]:
# GET_DUMMIES

import pandas as pd

data = {
    'Customer_id': pd.Series([1, 2, 3, 4, 5]),
    'Loan_type': pd.Series(['Home Loan', 'Personal Loan', 'Education Loan', 'Home Loan', 'Credit Loan']),
    'Income': pd.Series(['30K', '25K', '15K', '40K', '35K'])
}

# This call is the key step. It converts the text columns (Loan_type and Income)
# into indicator columns, while the numeric Customer_id column is left as it is:
# Loan_type: one new column per loan type ('Home Loan', 'Personal Loan', ...),
#            holding 1 if the customer has that loan type and 0 otherwise.
# Income:    the same happens for the income values ('30K', '25K', ...).
# prefix_sep='_':
#   New columns are named <original column>_<category>, e.g. 'Loan_type_Home Loan'.
# drop_first=True:
#   The first category in sorted order ('Credit Loan' and '15K') gets no column.
#   No information is lost: a row with 0 in every remaining column of a group
#   belongs to the dropped category.
# dtype=int:
#   Newer pandas versions produce True/False indicator columns by default;
#   asking for integers gives the 1/0 values described above on every version.
loan_info = pd.get_dummies(pd.DataFrame(data), prefix_sep='_', drop_first=True, dtype=int)
print(loan_info)

In [None]:
# LABEL ENCODER()
#
# Replaces the values of every column with integer codes.

import pandas as pd
from sklearn.preprocessing import LabelEncoder  # encoder class from scikit-learn's preprocessing module

# Customer information, one Series per column
data = {
    'Customer_id': pd.Series([1, 2, 3, 4, 5]),
    'Loan_type': pd.Series(['Home Loan', 'Personal Loan', 'Education Loan', 'Home Loan', 'Credit Loan']),
    'Income': pd.Series(['30K', '25K', '15K', '40K', '35K']),
}

# Table built from the dictionary above
loan_info = pd.DataFrame(data)

# One encoder object, reused for each column in turn
labelencoder = LabelEncoder()

# DataFrame.apply hands the columns to the encoder one at a time; fit_transform
# gives each distinct value in that column its own integer code (0, 1, 2, ...).
# E.g. a column containing 'Home Loan', 'Personal Loan' and 'Credit Loan' ends up
# with three different codes.
# Note: the encoder is refitted for every column, so afterwards it only holds the
# classes of the last column it processed.
loan_info_upd = loan_info.apply(lambda column: labelencoder.fit_transform(column))
print(loan_info_upd)

1. Label Encoding Process:
When you apply LabelEncoder to a column, it transforms each unique value into a numeric code, starting from 0. The encoding is done based on the lexicographical (alphabetical) order of the unique values in the column.

Step-by-Step Explanation of Label Encoding:
For Loan_type:
The unique loan types in the column are:
['Home Loan', 'Personal Loan', 'Education Loan', 'Credit Loan']
LabelEncoder sorts these values alphabetically:
['Credit Loan', 'Education Loan', 'Home Loan', 'Personal Loan']
Then, it assigns numeric codes starting from 0:
'Credit Loan' → 0
'Education Loan' → 1
'Home Loan' → 2
'Personal Loan' → 3
So, after applying the encoding, the column Loan_type will have these numeric values in place of the text labels.

For Income:
The unique income values in the column are:
['30K', '25K', '15K', '40K', '35K']
LabelEncoder sorts these values alphabetically:
['15K', '25K', '30K', '35K', '40K']
Then, it assigns numeric codes starting from 0:
'15K' → 0
'25K' → 1
'30K' → 2
'35K' → 3
'40K' → 4
So, the income values are encoded into their respective numeric codes based on alphabetical sorting.

In [None]:
# ONE HOT ENCODER()
#
# Turns the table into a binary (one-hot) matrix.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder  # converts categorical variables into a binary matrix

# Customer information, one Series per column
data = {
    'Customer_id': pd.Series([1, 2, 3, 4, 5]),
    'Loan_type': pd.Series(['Home Loan', 'Personal Loan', 'Education Loan', 'Home Loan', 'Credit Loan']),
    'Income': pd.Series(['30K', '25K', '15K', '40K', '35K']),
}

# Table built from the dictionary above
loan_info = pd.DataFrame(data)

# Encoder object that performs the one-hot conversion
onehotencoder = OneHotEncoder()

# The whole frame is passed in, so every column (Customer_id included) is treated
# as categorical and expanded into one indicator column per distinct value.
encoded_sparse = onehotencoder.fit_transform(loan_info)

# Expand the encoder's output into an ordinary dense array so it prints in full
x = encoded_sparse.toarray()
print(x)

The toarray() method is used to convert a sparse matrix into a dense numpy array.

Sparse Matrix vs Dense Array:
Sparse Matrix:

A matrix where most of the elements are zero.
It's stored in a memory-efficient way by only keeping track of the non-zero elements, saving memory.
OneHotEncoder often returns a sparse matrix by default to save space, especially when there are many categories (which would result in many zeros).
Dense Array:

A matrix where all elements, including zeros, are stored explicitly.
This takes more memory but is easier to work with for printing and some types of computations.

In [None]:
# TEST TRAIN SPLIT

import pandas as pd
from sklearn.model_selection import train_test_split
# train_test_split from sklearn.model_selection is used to split the dataset into training and testing sets.

# Fixed seed for the shuffle so that re-running the cell yields the same split.
RANDOM_STATE = 42

# Load the dataset; cereal.csv is expected in the current working directory.
df = pd.read_csv("cereal.csv")

# Show the first five rows of the DataFrame.
# (A bare `df.head()` in the middle of a cell displays nothing, so print it explicitly.)
print(df.head())

# Restructuring the DataFrame
x = df[['calories', 'protein']].values
# Selects the calories and protein columns and stores their values in x.
# These are the input features that the model will learn from.
y = df['rating'].values
# The rating column is the target variable (the value we want to predict), and its values are stored in y.
# So, x is a 2D array containing the calories and protein data, while y is a 1D array containing the cereal ratings.

# Splitting the data-set into training and testing into 80%-20%
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)
# train_test_split shuffles the rows and splits x (input features) and y (target values)
# into two parts each: training and testing.
# test_size=0.2: 20% of the rows are set aside for testing, the remaining 80% are used for training.
# random_state: seeds the shuffle; without it every run would produce a different split.
# x_train: The training set for input features (80% of the calories and protein data).
# x_test: The testing set for input features (20% of the calories and protein data).
# y_train: The training set for the target variable (rating), corresponding to the training features.
# y_test: The testing set for the target variable (rating), corresponding to the testing features.

# Print the shapes of the training and testing sets
print('Trained data-set:', x_train.shape)
# (number of training rows, 2): about 80% of the rows in the file, 2 features (calories, protein)
print('Test data-set:', x_test.shape)
# (number of testing rows, 2): about 20% of the rows in the file, 2 features (calories, protein)