# Machine Learning Applications:
### 1- Sales Prediction Models: Use machine learning to predict a game's global sales from its platform, genre, publisher, critic score, and user score.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the raw video-game table from disk
df = pd.read_csv("Video_Games.csv")

# Column groups: categoricals are one-hot encoded, numerics are scaled
categorical_features = ['Platform', 'Genre', 'Publisher']
numerical_features = ['Critic_Score', 'User_Score']

# Predictor frame (independent of df) and the regression target.
# User_Score is forced to numeric; entries that cannot be parsed become NaN.
features = df[categorical_features + numerical_features].assign(
    User_Score=lambda frame: pd.to_numeric(frame['User_Score'], errors='coerce')
)
target = df['Global_Sales']

# Keep only fully populated predictor rows, then pick the target rows
# carrying the same index labels so X and y stay aligned
features_clean = features.dropna()
target_clean = target.loc[features_clean.index]

# Numeric branch: median imputation followed by standardisation.
# (The imputers stay in the pipeline even though dropna() above has already
# removed rows with missing predictor values.)
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: constant-fill, then one-hot encode; categories that
# were not seen during fit are ignored at predict time
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by ordinary least-squares regression
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    features_clean,
    target_clean,
    test_size=0.2,
    random_state=42,
)

# Fit on the training rows, then score the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report the error on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


Mean Squared Error (MSE): 1.934878350073142
Root Mean Squared Error (RMSE): 1.3909990474738443


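### 2- Recommendation Systems: Develop a system to recommend games to users based on their preferences or similar user profiles. The version below is content-based: it recommends games whose genre, platform, and publisher resemble a title the user already likes.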

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# The text features below are built by string concatenation, so empty
# categorical cells are replaced (in place) with the literal 'missing'
df.fillna(
    value=dict.fromkeys(('Genre', 'Platform', 'Publisher'), 'missing'),
    inplace=True,
)

# One space-separated "document" per game: "<Genre> <Platform> <Publisher>"
df['combined_features'] = (
    df['Genre'] + " " + df['Platform'] + " " + df['Publisher']
).astype(str)

# TF-IDF vectorizer that discards English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and encode every game's document in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

# Pairwise similarity of every game with every other game; with a single
# argument the matrix is compared against itself
cosine_sim = linear_kernel(tfidf_matrix)

# Function to get recommendations based on a game the user likes
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the games most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Name']``. If several rows carry the
        same name, the first matching row is used.
    cosine_sim : array-like, optional
        Square similarity matrix in which row/column ``i`` refers to the
        ``i``-th row of ``df`` by position. Defaults to the matrix computed
        at module level.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or str
        The ``Name`` values of the selected rows, best match first, or an
        explanatory message string when ``title`` is not in the dataset.
    """
    # Locate the game by *position*, not by index label: both cosine_sim and
    # .iloc below are positional, so this stays correct for any df index.
    name_matches = df['Name'].eq(title).to_numpy(dtype=bool, na_value=False)
    match_positions = name_matches.nonzero()[0]
    if match_positions.size == 0:
        return f"No recommendations found for '{title}' as it's not in the dataset."
    idx = int(match_positions[0])

    # Pair every other game with its similarity to the query game. The query
    # row is excluded explicitly: other games can tie with it at the top
    # score, so it is not guaranteed to be first after sorting.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep dataset order
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Positions of the selected games
    game_indices = [position for position, _ in sim_scores]

    # Names of the most similar games, best match first
    return df['Name'].iloc[game_indices]

# Demo input: the first five distinct, non-missing game names in the dataset
sample_titles = df['Name'].dropna().unique()[:5]

# Print the recommendations for each sample title, followed by blank lines
# that separate one title's output from the next
for title in sample_titles:
    print(f"Recommendations for {title}:")
    recommendations = get_recommendations(title)
    print(recommendations, end="\n\n\n")


Recommendations for Wii Sports:
3                           Wii Sports Resort
13                                    Wii Fit
15                               Wii Fit Plus
591                    Mario Strikers Charged
809                          Mario Sports Mix
911                      Mario Super Sluggers
1307     New Play Control! Mario Power Tennis
6364                              Punch-Out!!
7184                         Super Swing Golf
12181                            Mario Tennis
Name: Name, dtype: object


Recommendations for Super Mario Bros.:
22                     Super Mario Bros. 3
98                     Super Mario Bros. 2
574     Super Mario Bros.: The Lost Levels
698                            Mario Bros.
997                             Kid Icarus
1010                     Kirby's Adventure
1192                  Donkey Kong Classics
1262                           Ice Climber
1809                           Donkey Kong
1843                       Donkey Kong Jr.
Name: Name,