In [6]:
#solution1) 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [7]:
# Load the dataset
url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Census%20Income/Census%20Income.csv?raw=true"
data = pd.read_csv(url)


In [8]:
# Clean column names by stripping leading/trailing whitespace
data.columns = data.columns.str.strip()


In [9]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())


First few rows of the dataset:
   Age          Workclass  Fnlwgt   Education  Education_num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        Marital_status          Occupation    Relationship    Race      Sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White     Male   
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   Capital_gain  Capital_loss  Hours_per_week  Native_country  Income  
0             0             0        

In [10]:
# Preprocess the dataset
# Handle missing values (if any)
data = data.dropna()


In [11]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [12]:
# Separate features and target variable
X = data.drop('Income', axis=1)
y = data['Income']


In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Standardize the feature variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [16]:
# Make predictions on the testing set
y_pred = model.predict(X_test)


In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Model Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:")
print(report)



Model Accuracy: 82.02%
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4912
           1       0.70      0.46      0.56      1600

    accuracy                           0.82      6512
   macro avg       0.77      0.70      0.72      6512
weighted avg       0.81      0.82      0.81      6512



In [21]:
# Save the trained model and preprocessing objects (if needed)
import joblib
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')

print("Model and preprocessing objects saved.")


Model and preprocessing objects saved.


In [24]:
#solution2)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import joblib
import numpy as np


In [25]:
# Load the dataset
url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Rainfall%20Forecast/Rainfall.csv?raw=true"
data = pd.read_csv(url)


In [26]:
# Clean column names by stripping leading/trailing whitespace
data.columns = data.columns.str.strip()


In [27]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())


First few rows of the dataset:
         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  

In [29]:
# Preprocess the dataset
data = data.ffill().bfill()


In [30]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [31]:
# Task 1: Predict whether it will rain tomorrow (classification)
X_classification = data.drop(['RainTomorrow'], axis=1)
y_classification = data['RainTomorrow']


In [32]:
# Split the dataset into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)


In [33]:
# Standardize the feature variables
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_cls)
X_test_cls = scaler_cls.transform(X_test_cls)


In [34]:
# Train the Logistic Regression model for classification
model_cls = LogisticRegression()
model_cls.fit(X_train_cls, y_train_cls)


In [35]:
# Make predictions on the testing set for classification
y_pred_cls = model_cls.predict(X_test_cls)


In [36]:
# Evaluate the classification model
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
report_cls = classification_report(y_test_cls, y_pred_cls)

print("Classification Model Accuracy: {:.2f}%".format(accuracy_cls * 100))
print("Classification Report:")
print(report_cls)


Classification Model Accuracy: 85.40%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1304
           1       0.75      0.54      0.62       381

    accuracy                           0.85      1685
   macro avg       0.81      0.74      0.77      1685
weighted avg       0.85      0.85      0.84      1685



In [37]:
# Task 2: Predict the amount of rainfall (regression)
# Prepare data for regression (exclude rows with zero RainTomorrow)
X_regression = data.drop(['RainTomorrow'], axis=1)
y_regression = data['RainTomorrow'].astype(float)


In [38]:
# Split the dataset into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)


In [39]:
# Standardize the feature variables
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)


In [40]:
# Train the Linear Regression model for regression
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)


In [41]:
# Make predictions on the testing set for regression
y_pred_reg = model_reg.predict(X_test_reg)


In [42]:
# Evaluate the regression model
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = mse_reg ** 0.5

print("Regression Model RMSE: {:.2f}".format(rmse_reg))


Regression Model RMSE: 0.35


In [43]:
# Save the trained models and preprocessing objects
joblib.dump(model_cls, 'logistic_regression_rainfall_model.pkl')
joblib.dump(scaler_cls, 'scaler_cls.pkl')
joblib.dump(model_reg, 'linear_regression_rainfall_model.pkl')
joblib.dump(scaler_reg, 'scaler_reg.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')

print("Models and preprocessing objects saved.")


Models and preprocessing objects saved.


In [44]:
#solution3)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import joblib
import numpy as np


In [45]:
# Load the dataset
url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Insurance%20Claim%20Fraud%20Detection/Automobile_insurance_fraud.csv"
data = pd.read_csv(url, header=None)


In [46]:
# Manually assign column names
data.columns = [
    'months_as_customer', 'age', 'policy_number', 'policy_bind_date', 'policy_state',
    'policy_csl', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
    'insured_zip', 'insured_sex', 'insured_education_level', 'insured_occupation',
    'insured_hobbies', 'insured_relationship', 'capital-gains', 'capital-loss',
    'incident_date', 'incident_type', 'collision_type', 'incident_severity',
    'authorities_contacted', 'incident_state', 'incident_city', 'incident_location',
    'incident_hour_of_the_day', 'number_of_vehicles_involved', 'property_damage',
    'bodily_injuries', 'witnesses', 'police_report_available', 'total_claim_amount',
    'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make', 'auto_model',
    'auto_year', 'fraud_reported'
]


In [47]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())


First few rows of the dataset:
   months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       17-10-2014           OH   
1                 228   42         342868       27-06-2006           IN   
2                 134   29         687698       06-09-2000           OH   
3                 256   41         227811       25-05-1990           IL   
4                 228   44         367455       06-06-2014           IL   

  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0    250/500               1000                1406.91               0   
1    250/500               2000                1197.22         5000000   
2    100/300               2000                1413.14         5000000   
3    250/500               2000                1415.74         6000000   
4   500/1000               1000                1583.91         6000000   

   insured_zip  ... witnesses police_report_available total_claim_amount 

In [48]:
# Check for the presence of the '_c39' column and drop it if it exists
if '_c39' in data.columns:
    data = data.drop(columns=['_c39'])


In [49]:
# Display the columns to verify the dataset structure
print("Dataset columns:")
print(data.columns)


Dataset columns:
Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')


In [50]:
# Preprocess the dataset
data = data.ffill().bfill()


In [51]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [52]:
# Task 1: Predict whether the claim is fraudulent (classification)
X_classification = data.drop(['fraud_reported'], axis=1)
y_classification = data['fraud_reported']


In [53]:
# Split the dataset into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)


In [54]:
# Standardize the feature variables
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_cls)
X_test_cls = scaler_cls.transform(X_test_cls)


In [55]:
# Train the Logistic Regression model for classification
model_cls = LogisticRegression()
model_cls.fit(X_train_cls, y_train_cls)


In [56]:
# Make predictions on the testing set for classification
y_pred_cls = model_cls.predict(X_test_cls)


In [57]:
# Evaluate the classification model
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
report_cls = classification_report(y_test_cls, y_pred_cls)

print("Classification Model Accuracy: {:.2f}%".format(accuracy_cls * 100))
print("Classification Report:")
print(report_cls)


Classification Model Accuracy: 70.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.88      0.81       145
           1       0.41      0.22      0.29        55

    accuracy                           0.70       200
   macro avg       0.58      0.55      0.55       200
weighted avg       0.66      0.70      0.67       200



In [58]:
# Task 2: Predict the total claim amount (regression)
# Prepare data for regression (excluding rows with zero total_claim_amount if any)
X_regression = data.drop(['total_claim_amount'], axis=1)
y_regression = data['total_claim_amount'].astype(float)


In [59]:
# Split the dataset into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)


In [60]:
# Standardize the feature variables
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)


In [61]:
# Train the Linear Regression model for regression
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)


In [62]:
# Make predictions on the testing set for regression
y_pred_reg = model_reg.predict(X_test_reg)


In [63]:
# Evaluate the regression model
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = mse_reg ** 0.5

print("Regression Model RMSE: {:.2f}".format(rmse_reg))


Regression Model RMSE: 0.00


In [65]:
# Save the trained models and preprocessing objects
joblib.dump(model_cls, 'logistic_regression_fraud_model.pkl')
joblib.dump(scaler_cls, 'scaler_cls.pkl')
joblib.dump(model_reg, 'linear_regression_claim_model.pkl')
joblib.dump(scaler_reg, 'scaler_reg.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')


print("Models and preprocessing objects saved.")


Models and preprocessing objects saved.


In [67]:
#solution4) 

import pandas as pd    
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import joblib

In [68]:
# Load the datasets
zomato_url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Z_Restaurant/zomato.csv"
country_code_url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Z_Restaurant/Country-Code.xlsx?raw=true"


In [69]:
# Read the CSV and Excel files with specified encoding
zomato = pd.read_csv(zomato_url, encoding='ISO-8859-1')
country_code = pd.read_excel(country_code_url)


In [70]:
# Merge datasets on 'Country Code'
data = pd.merge(zomato, country_code, on='Country Code', how='left')


In [71]:
# Display the first few rows of the merged dataset
print("First few rows of the merged dataset:")
print(data.head())


First few rows of the merged dataset:
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   

                                     Locality  \
0   Century City Mall, Poblacion, Makati City   
1  Little Tokyo, Legaspi Village, Makati City   
2  Edsa Shangri-La, O

In [72]:
# Display the columns of the dataset
print("Dataset columns:")
print(data.columns)


Dataset columns:
Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes', 'Country'],
      dtype='object')


In [73]:
# Preprocess the dataset
data = data.ffill().bfill()


In [74]:
# Encode categorical variables
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [75]:
# Task 1: Predict the 'Average Cost for two' (regression)
X_regression = data.drop(['Average Cost for two', 'Price range'], axis=1)
y_regression = data['Average Cost for two']


In [76]:
# Split the dataset into training and testing sets for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)


In [77]:
# Standardize the feature variables
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)


In [80]:
# Train the Linear Regression model for regression
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)


In [81]:
# Make predictions on the testing set for regression
y_pred_reg = model_reg.predict(X_test_reg)


In [83]:
# Evaluate the regression model
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = mse_reg ** 0.5

print("Regression Model RMSE (Average Cost for two): {:.2f}".format(rmse_reg))



Regression Model RMSE (Average Cost for two): 19227.95


In [84]:
# Task 2: Predict the 'Price range' (classification)
X_classification = data.drop(['Price range'], axis=1)
y_classification = data['Price range']


In [85]:
# Split the dataset into training and testing sets for classification
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)


In [86]:
# Standardize the feature variables
scaler_cls = StandardScaler()
X_train_cls = scaler_cls.fit_transform(X_train_cls)
X_test_cls = scaler_cls.transform(X_test_cls)


In [89]:
# Train the Logistic Regression model for classification
model_cls = LogisticRegression()
model_cls.fit(X_train_cls, y_train_cls)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
# Make predictions on the testing set for classification
y_pred_cls = model_cls.predict(X_test_cls)


In [91]:
# Evaluate the classification model
accuracy_cls = accuracy_score(y_test_cls, y_pred_cls)
report_cls = classification_report(y_test_cls, y_pred_cls)

print("Classification Model Accuracy (Price range): {:.2f}%".format(accuracy_cls * 100))
print("Classification Report:")
print(report_cls)


Classification Model Accuracy (Price range): 66.56%
Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.88      0.81       907
           2       0.57      0.48      0.52       618
           3       0.55      0.53      0.54       266
           4       0.54      0.29      0.38       120

    accuracy                           0.67      1911
   macro avg       0.60      0.55      0.56      1911
weighted avg       0.65      0.67      0.65      1911



In [92]:
# Save the trained models and preprocessing objects
joblib.dump(model_reg, 'linear_regression_avg_cost_model.pkl')
joblib.dump(scaler_reg, 'scaler_reg.pkl')
joblib.dump(model_cls, 'logistic_regression_price_range_model.pkl')
joblib.dump(scaler_cls, 'scaler_cls.pkl')
for column, encoder in label_encoders.items():
    joblib.dump(encoder, f'label_encoder_{column}.pkl')

print("Models and preprocessing objects saved.")


Models and preprocessing objects saved.
