<a href="https://colab.research.google.com/github/ankit-genzeon/AI-ML-Bootcamp-Genzeon-2023/blob/master/case_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Path of the SUV purchase CSV. The /content/drive prefix presumably requires
# Google Drive to be mounted in Colab before this cell runs -- TODO confirm.
dataset = '/content/drive/MyDrive/Colab Notebooks/SUV_Purchase.csv'

# Read the CSV into the DataFrame that the following cells operate on.
data = pd.read_csv(filepath_or_buffer=dataset)

In [None]:
# Preview the first five rows (the default row count of head) to sanity-check the load.
data.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the 'Age' column, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(data['Age'])
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.set_title('Age Distribution')
plt.show()

# Mean of 'Purchased' within each 'Gender' group, one bar per group.
purchase_by_gender = data.groupby('Gender')['Purchased'].mean()
fig, ax = plt.subplots()
ax.bar(purchase_by_gender.index, purchase_by_gender)
ax.set_xlabel('Gender')
ax.set_ylabel('Purchase Probability')
ax.set_title('Purchase Probability by Gender')
plt.show()


In [5]:
# List the column labels of the loaded DataFrame; later cells select columns by these names.
data.columns


Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [None]:
import seaborn as sns

# Columns to include in the correlation analysis.
numeric_columns = ['Age', 'EstimatedSalary', 'Purchased']

# Pairwise correlations between the selected columns (DataFrame.corr defaults).
correlation_matrix = data.loc[:, numeric_columns].corr()

# Annotated heatmap of the correlation matrix on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Apply seaborn's "notebook" scaling preset to the plots that follow.
# sns.plotting_context() only *returns* the parameter dictionary (or acts as a
# context manager); sns.set_context() is the call that actually applies it.
sns.set_context('notebook')

# Pairwise relationships between the columns, coloured by gender.
# 'User ID' is a row identifier rather than a measurement, so it is left out of
# the grid; errors='ignore' keeps the cell working if the column is absent.
pairplot_frame = data.drop(columns='User ID', errors='ignore')
sns.pairplot(pairplot_frame, hue='Gender', palette='tab20')

# Render the figure explicitly so the PairGrid repr is not echoed as cell output.
plt.show()

In [None]:
# Before splitting the data and training a model, it's often necessary to preprocess the data.
# This may include handling missing values, encoding categorical variables, or scaling numerical features.
# The specific preprocessing steps depend on your dataset and the machine learning algorithm you plan to use.

In [9]:
# Preprocessing: clean the data, encode the categorical column, and build the
# train/test split used by the modelling cells.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Drop rows with missing values.
data = data.dropna()

# Encode the categorical 'Gender' column as integer codes. assign() builds a new
# frame instead of writing a column into the result of dropna().
# NOTE: `data` is overwritten with the encoded values, so re-running this cell
# refits the encoder on the integer codes instead of the original labels.
# Restart the kernel and run all cells to restore the original label mapping.
label_encoder = LabelEncoder()
data = data.assign(Gender=label_encoder.fit_transform(data['Gender']))

# Features are every column except the target and 'User ID'. The ID is a row
# identifier, so feeding it to the model only adds noise and forces every
# future prediction to supply a meaningless ID value.
X = data.drop(columns=['User ID', 'Purchased'])
y = data['Purchased']

# Hold out 20% of the rows for testing. A fixed random_state makes the split (and
# every metric derived from it) reproducible; stratify=y keeps the class
# proportions of 'Purchased' the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Preview the first rows of the preprocessed frame instead of dumping the whole
# DataFrame into the notebook output.
data.head()

In [None]:
import seaborn as sns

# Same correlation analysis as before, now including the integer-encoded 'Gender' column.
numeric_columns = ['Age', 'Gender', 'EstimatedSalary', 'Purchased']

# Pairwise correlations between the selected columns.
correlation_matrix = data[numeric_columns].corr()

# sns.heatmap returns the Axes it drew on; title it through that handle.
heatmap_ax = sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Preview the first rows of the training features instead of dumping the whole
# DataFrame into the notebook output.
X_train.head()

In [None]:
# Model training
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier with default settings and fit it on the
# training split in a single expression (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
model


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Predicted labels for the held-out test features.
y_pred = model.predict(X_test)

# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

# Counts of true vs. predicted label combinations, printed under a heading.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix : ', confusion, sep='\n')


In [None]:
from sklearn.metrics import (
    classification_report,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Error metrics on the test predictions. These are regression-style measures; they
# are kept because a later cell prints them. Assuming 'Purchased' is coded 0/1,
# MSE and MAE both reduce to the misclassification rate and R2 is hard to
# interpret, so rely on the classification report below for model quality.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Accuracy in percent on the held-out test split only. Scoring on the full X, y
# would include the rows the model was trained on and overstate performance.
model_score = model.score(X_test, y_test) * 100

# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)
print("Model Score:", model_score)

# Per-class precision, recall and F1. zero_division=0 reports 0 instead of
# warning when a class is never predicted.
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Actual vs. predicted labels as a table of counts.
# The labels are discrete, so a scatter plot of actual vs. predicted stacks every
# test row onto a handful of points and hides how many rows each point represents;
# a count heatmap shows the same comparison with the numbers visible.
confusion_counts = pd.crosstab(
    y_test.to_numpy(),
    y_pred,
    rownames=['Actual Values'],
    colnames=['Predicted Values'],
)
fig, ax = plt.subplots()
sns.heatmap(confusion_counts, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Actual vs. Predicted Values')
plt.show()

# Residuals (actual - predicted), summarised as counts per distinct value.
# Assuming 'Purchased' is coded 0/1: 0 = correct prediction, 1 = a purchase the
# model missed, -1 = a purchase the model predicted that did not happen.
residuals = y_test - y_pred
residual_counts = residuals.value_counts().sort_index()
fig, ax = plt.subplots()
ax.bar(residual_counts.index.astype(str), residual_counts.to_numpy())
ax.set_xlabel('Residuals')
ax.set_ylabel('Count')
ax.set_title('Residual Counts')
plt.show()

# Class frequencies in the actual and the predicted labels, side by side.
# A kernel density estimate assumes a continuous variable, so counts are used
# for these discrete labels instead.
class_counts = (
    pd.DataFrame({
        'Actual Values': y_test.value_counts(),
        'Predicted Values': pd.Series(y_pred).value_counts(),
    })
    .fillna(0)  # a class that was never predicted gets a count of 0
    .sort_index()
)
fig, ax = plt.subplots()
class_counts.plot.bar(ax=ax, rot=0)
ax.set_xlabel('Target Variable')
ax.set_ylabel('Count')
ax.set_title('Distribution of Actual vs. Predicted Values')
plt.show()

# Heatmap of the correlation matrix of `data`. Only numeric columns are used so
# the call does not depend on every column having been encoded beforehand.
corr = data.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Print the evaluation metrics (mse, mae, r2 and model_score are computed in the
# preceding metrics cell, which must have been run first).
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)
print("Model Score:", model_score)


In [None]:
# Inspect the coefficients learned by the fitted model.
# NOTE(review): presumably one weight per feature, in the column order of X_train --
# confirm against the scikit-learn LogisticRegression documentation.
model.coef_

In [None]:
# Predict the purchase decision for one new customer.

# Raw description of the customer.
# - 'Gender' is given as the already-encoded integer code (an index into
#   label_encoder.classes_). It must not be passed through
#   label_encoder.transform again: that would fail if the encoder was fitted on
#   the raw gender labels. To start from a raw label instead, use
#   label_encoder.transform([label])[0].
# - 'User ID' is a placeholder that is only used when the model was trained with
#   that column among its features; otherwise it is discarded below.
new_customer = {
    'User ID': 0,
    'Gender': 1,
    'Age': 27,
    'EstimatedSalary': 50000,
}

# Guard against a gender code the encoder never produced.
if not 0 <= new_customer['Gender'] < len(label_encoder.classes_):
    raise ValueError(
        f"Gender code {new_customer['Gender']} is outside the encoder's range "
        f"0..{len(label_encoder.classes_) - 1}"
    )

# Build a one-row frame and keep exactly the columns the model was fitted on, in
# the same order. Dropping a column that is not present raises KeyError, and a
# different column order is rejected by predict().
new_data = pd.DataFrame([new_customer])
new_data = new_data[list(X_train.columns)]

# Make predictions for the new data
new_prediction = model.predict(new_data)

# Print the prediction
if new_prediction[0] == 1:
    print("The potential customer is likely to purchase the car.")
else:
    print("The potential customer is unlikely to purchase the car.")
