## Tasks
1. Perform min-max scaling on a these set of values [171, 120, 86, 176, 77]. The input values range from 32 to 212 and the output should range from 0 to 100.
2. Perform standardization (2-score normalization) on a dataset with the following values: [50, 60, 70, 89, 90]. Ensure the transformed values have a mean of 0 and a standard deviation of 1.
3. Convert the categorical labels ['cat', 'dog', 'fish', 'cat', 'dog'] into numerical labels using label encoding.
4. Apply one-hot encoding to the categorical variable ['apple', 'banana", "orange", "banana", "banana", "apple", orange", "orange"].
5. Split the dataset X - [[1], [2], [3], [4], [5], [6], [7], [8]] and y - [10, 20, 30, 40, 50, 69, 70, 89] into training and testing sets with a test size of 25%.
6. Generate a confusion matrix for the true labels [1, 0, 1, 1, 0] and the predicted labels [1, 0, 0, 1, 1]
7. Train a random forest model on the breast-cancer dataset and output the importance of each feature.
8. Train a support vector regression model on the house-pricing dataset and check if the model performs better than the linear regression model.


In [27]:
# 1. Perform min-max scaling on a these set of values [171, 120, 86, 176, 77]. The input values range from 32 to 212 and the output should range from 0 to 100. Use scikit learn

import pandas as pd
data = pd.DataFrame({'values': [171, 120, 86, 176, 77]})
data['scaled_values'] = (data['values'] - 32) / (212 - 32) * (100 - 0) + 0
print(data)


   values  scaled_values
0     171      77.222222
1     120      48.888889
2      86      30.000000
3     176      80.000000
4      77      25.000000


In [28]:
#2. Perform standardization (2-score normalization) on a dataset with the following values: [50, 60, 70, 89, 90]. Ensure the transformed values have a mean of 0 and a standard deviation of 1.

data = pd.DataFrame({'values': [50, 60, 70, 80, 90]})
mean = data['values'].mean()
std_dev = data['values'].std()
data['z_scores'] = (data['values'] - mean) / std_dev
print(data)

   values  z_scores
0      50 -1.264911
1      60 -0.632456
2      70  0.000000
3      80  0.632456
4      90  1.264911


In [29]:
#3. Convert the categorical labels ['cat', 'dog', 'fish', 'cat', 'dog'] into numerical labels using label encoding.

from sklearn.preprocessing import LabelEncoder
data = ['cat', 'dog', 'fish', 'cat', 'dog']
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(data)
print(encoded_data)


[0 1 2 0 1]


In [30]:
#4. Apply one-hot encoding to the categorical variable ['apple', 'banana", "orange", "banana", "banana", "apple", orange", "orange"].

from sklearn.preprocessing import OneHotEncoder
data = ['apple', 'banana', 'orange', 'banana', 'banana', 'apple', 'orange', 'orange']
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(np.array(data).reshape(-1, 1))
print(encoded_data.toarray())


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [31]:
#5. Split the dataset X - [[1], [2], [3], [4], [5], [6], [7], [8]] and y - [10, 20, 30, 40, 50, 69, 70, 89] into training and testing sets with a test size of 25%.

from sklearn.model_selection import train_test_split
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8]])
y = np.array([10, 20, 30, 40, 50, 69, 70, 89])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print("X_train : ",X_train)
print("X_test : ",X_test)
print("y_train : ",y_train)
print("y_test : ",y_test)



X_train :  [[3]
 [7]
 [6]
 [2]
 [5]
 [8]]
X_test :  [[1]
 [4]]
y_train :  [30 70 69 20 50 89]
y_test :  [10 40]


In [32]:
#6. Generate a confusion matrix for the true labels [1, 0, 1, 1, 0] and the predicted labels [1, 0, 0, 1, 1]

from sklearn.metrics import confusion_matrix
true_labels = [1, 0, 1, 1, 0]
predicted_labels = [1, 0, 0, 1, 1]
confusion_mat = confusion_matrix(true_labels, predicted_labels)
print(confusion_mat)


[[1 1]
 [1 2]]


In [33]:
#7. Train a random forest model on the breast-cancer dataset and output the importance of each feature.

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
data = pd.read_csv("/home/shreya/Documents/Stuff/PES/sem5/machineLearning/lab1/classification/data.csv")
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']
model = RandomForestClassifier()
model.fit(X, y)
importances = model.feature_importances_
print(importances)

[0.0049816  0.04984449 0.0121135  0.02985625 0.06148343 0.00686525
 0.01443843 0.0550635  0.07907604 0.00306517 0.00321906 0.01470763
 0.0050357  0.01088446 0.03188131 0.00394623 0.00458477 0.00472829
 0.00268088 0.00367832 0.00520972 0.08673133 0.01721012 0.12495754
 0.1406343  0.00906086 0.01824997 0.04119069 0.13724838 0.01144061
 0.00593217 0.        ]


In [34]:
#8. Train a support vector regression model on the house-pricing dataset and check if the model performs better than the linear regression model.

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

data = pd.read_csv("/home/shreya/Documents/Stuff/PES/sem5/machineLearning/lab1/regression/sample_submission.csv")
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']
svr_model = SVR()
linear_model = LinearRegression()
svr_model.fit(X, y)
linear_model.fit(X, y)
svr_score = svr_model.score(X, y)
linear_score = linear_model.score(X, y)
print("SVR Score : ",svr_score)
print("Linear Score : ",linear_score)

SVR Score :  0.0005705290552791009
Linear Score :  0.015917529104617967
