Title: Popular Classification Algorithms


K Nearst Neighbors (KNN)

Task 1: Classify fruits based on weight and color.

In [1]:
# Write your code here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Example fruit dataset (replace with your actual dataset)
data = {
    'weight': [150, 200, 130, 120, 180, 160, 100, 250, 190, 210],
    'color': ['Red', 'Yellow', 'Red', 'Green', 'Yellow', 'Red', 'Green', 'Yellow', 'Red', 'Yellow'],
    'fruit': ['Apple', 'Banana', 'Apple', 'Grapes', 'Banana', 'Apple', 'Grapes', 'Banana', 'Apple', 'Banana']
}

# Create DataFrame
df = pd.DataFrame(data)

# Features (weight and color) and labels (fruit)
X = df[['weight', 'color']]
y = df['fruit']

# Convert categorical data (color) to numerical data using LabelEncoder
label_encoder = LabelEncoder()
X['color'] = label_encoder.fit_transform(X['color'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['color'] = label_encoder.fit_transform(X['color'])


Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

       Apple       1.00      1.00      1.00         2
      Banana       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



Task 2: Predict customer clothing size based on height and weight.

In [2]:
# Write your code here
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Example dataset (replace with your actual dataset)
data = {
    'height': [160, 175, 180, 165, 170, 158, 185, 168, 174, 162],
    'weight': [55, 70, 80, 60, 65, 50, 85, 68, 72, 58],
    'clothing_size': ['Medium', 'Large', 'Large', 'Medium', 'Large', 'Small', 'Large', 'Medium', 'Large', 'Small']
}

# Create DataFrame
df = pd.DataFrame(data)

# Features (height and weight) and labels (clothing size)
X = df[['height', 'weight']]
y = df['clothing_size']

# Convert categorical labels (clothing size) to numerical values using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.67
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.33      0.33         3
weighted avg       0.67      0.67      0.67         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Task 3: Determine optimal movie recommendation based on viewer preferences.

In [3]:
# Write your code here
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Sample Data: Replace with your movie ratings dataset (userID, movieID, rating)
data = {
    'userID': [1, 1, 2, 2, 3, 3, 4, 4, 5],
    'movieID': [101, 102, 101, 103, 102, 104, 103, 105, 101],
    'rating': [5, 3, 4, 2, 5, 1, 3, 4, 5]
}

# Create DataFrame
df = pd.DataFrame(data)

# Create a pivot table with userID as rows and movieID as columns
pivot_df = df.pivot(index='userID', columns='movieID', values='rating')

# Fill NaN values with 0 (assuming unrated movies are rated as 0 by default)
pivot_df = pivot_df.fillna(0)

# Standardizing the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pivot_df)

# Compute cosine similarity between users
cosine_sim = cosine_similarity(scaled_data)

# Convert the cosine similarity matrix to a DataFrame for easy visualization
cosine_sim_df = pd.DataFrame(cosine_sim, index=pivot_df.index, columns=pivot_df.index)

# Function to recommend movies based on user ID
def recommend_movies(user_id, top_n=3):
    # Get similar users
    similar_users = cosine_sim_df[user_id].sort_values(ascending=False)
    
    # Exclude the user themselves
    similar_users = similar_users.drop(user_id)
    
    # Get the top N similar users
    top_similar_users = similar_users.head(top_n)
    
    # Get the movies rated by the top similar users
    recommended_movies = set()
    for similar_user in top_similar_users.index:
        rated_movies = pivot_df.loc[similar_user][pivot_df.loc[similar_user] > 0].index
        recommended_movies.update(rated_movies)
    
    # Return the recommended movies
    return list(recommended_movies)

# Example: Recommend movies for user 1
recommended_for_user_1 = recommend_movies(1)
print(f"Movies recommended for User 1: {recommended_for_user_1}")



Movies recommended for User 1: [104, 101, 102, 103]
