In [1]:
#@title TASK 1

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("house_data.csv")
print("missing values in dataset")
print(df.isnull().sum())

numeric_cols = ['square_footage','bedrooms','bathrooms','age']
for col in numeric_cols:
  df[col] = df[col].fillna(df[col].mean())


le = LabelEncoder()
df['neighborhood_encoded'] = le.fit_transform(df['neighborhood'])
print("missing values in dataset after cleaning:")
print(df.isnull().sum())

features = ['square_footage', 'bedrooms', 'bathrooms', 'age', 'neighborhood_encoded']
target = 'price'

X = df[features]
Y = df[target]

X_train, X_test, Y_train, Y_test= train_test_split(X,Y,test_size=0.2,random_state=42)

model = LinearRegression()
model.fit(X_train,Y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

new_data = pd.DataFrame([{
    'square_footage': 2000,
    'bedrooms': 3,
    'bathrooms': 2,
    'age': 5,
    'neighborhood_encoded': le.transform(['Downtown'])[0]
}], columns=features)


predicted_price = model.predict(new_data)[0]
print("Predicted Price: $",predicted_price)

missing values in dataset
square_footage    0
bedrooms          0
bathrooms         0
age               0
neighborhood      0
price             0
dtype: int64
missing values in dataset after cleaning:
square_footage          0
bedrooms                0
bathrooms               0
age                     0
neighborhood            0
price                   0
neighborhood_encoded    0
dtype: int64
MAE: 93634.12
MSE: 9958408112.69
RMSE: 99791.82
R²: 0.36
Predicted Price: $ 505364.6477132265


In [2]:
#@title TASK 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

df = pd.read_csv("emails.csv")
print(df['label'].value_counts())

X = df['text']
Y = df['label']

vectorizer = TfidfVectorizer(stop_words='english',max_df=0.7)
X_vect = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vect, Y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

new_email = "Congratulations! You've won a $1000 gift card. Click here to claim."
email_vect = vectorizer.transform([new_email])
prediction = model.predict(email_vect)[0]

print("\nSpam" if prediction == 1 else "\nNot Spam")

label
1    5
0    5
Name: count, dtype: int64
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Confusion Matrix:
 [[1 0]
 [0 1]]

Spam


In [3]:
#@title TASK 3

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy import stats

df = pd.read_csv("customer_data.csv")
df.fillna(df.mean(), inplace=True)
df = df[(np.abs(stats.zscore(df.select_dtypes(include=['float64', 'int64']))) < 3).all(axis=1)]

features = ['total_spending', 'age', 'num_visits', 'purchase_frequency']
target = 'customer_value'

X = df[features]
y = df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

new_customer = [4000, 35, 12, 0.7]
new_customer_vect = scaler.transform([new_customer])
prediction = model.predict(new_customer_vect)[0]

print("\nHigh-Value Customer" if prediction == 1 else "Low-Value Customer")


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

Confusion Matrix:
 [[2 0]
 [0 2]]

High-Value Customer


