In [None]:
#!pip install pygwalker
#!pip install ydata_profiling
import pandas as pd
import numpy as np
import pygwalker as pyg
import seaborn as sns
import matplotlib.pyplot as plt
import IPython.display
import ydata_profiling
from scipy.stats import skew
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# EDA + Preprocessing

In [None]:
# Visual EDA: load the raw Kickstarter workbook and open it in PyGWalker's
# drag-and-drop (Tableau-like) explorer.
# NOTE(review): the absolute Windows path only resolves on the author's machine.
kickstart = pd.read_excel("E:/Users/pc/Downloads/Mining proj/Induvidual proj/Kickstarter.xlsx")
walker = pyg.walk(kickstart)
# Bare last expression so the notebook renders the walker object.
walker

In [None]:
# Interactive EDA: build a ydata-profiling report for the raw frame and write
# it to an HTML file.
# NOTE(review): the output path is an absolute location on the E: drive.
profile = ydata_profiling.ProfileReport(kickstart)
profile.to_file(output_file="E:/output2016_20.html")

In [None]:
# Normalise project names: missing names become empty strings, everything is
# lower-cased (this overwrites the 'name' column).
kickstart['name'] = kickstart['name'].fillna('').str.lower()

# Fit a TF-IDF vectoriser on the names; tfidf_matrix has one row per project
# and is read by impute_category to find similarly named projects.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(kickstart['name'])

# Function to impute missing values
def impute_category(row, n_neighbors=2):
    """Return ``row['category']``, imputing a missing value from similarly named projects.

    Rows that already have a category are returned unchanged. For a row with a
    missing category, the ``n_neighbors`` projects whose names are most similar
    (TF-IDF cosine similarity) AND whose category is known vote on the result.

    Reads the notebook globals ``kickstart`` and ``tfidf_matrix``.

    Parameters
    ----------
    row : pd.Series
        One row of ``kickstart`` as passed by ``DataFrame.apply(axis=1)``.
    n_neighbors : int, default 2
        Number of most similar donor rows that take part in the vote.

    Returns
    -------
    The (possibly imputed) category, or ``np.nan`` when no project with a known
    category shares any term with this row's name.
    """
    if not pd.isna(row['category']):
        return row['category']

    # Similarity between this project's name and every project name.
    # assumes kickstart still has its default 0..n-1 index, so row.name is a
    # valid row position in tfidf_matrix -- TODO confirm if the frame is re-indexed.
    similarities = cosine_similarity(tfidf_matrix, tfidf_matrix[row.name]).ravel()

    categories = kickstart['category']
    # Donors must have a known category (this also excludes the row itself,
    # whose category is missing) and share at least one term with the name;
    # a zero-similarity "neighbour" carries no information.
    donors = np.flatnonzero(categories.notna().to_numpy() & (similarities > 0))
    if donors.size == 0:
        return np.nan

    # Most similar donors first; the stable sort keeps ties in row order.
    ranked = donors[np.argsort(-similarities[donors], kind='stable')][:n_neighbors]
    neighbour_categories = categories.iloc[ranked]

    # Majority vote; a tie goes to the category of the most similar neighbour
    # rather than to whichever label sorts first alphabetically.
    votes = neighbour_categories.value_counts()
    winners = set(votes.index[votes == votes.max()])
    for candidate in neighbour_categories:
        if candidate in winners:
            return candidate
    return np.nan

# Apply imputation row by row; the 'category' column is replaced only after
# every row has been processed, so impute_category sees the original values.
kickstart['category'] = kickstart.apply(impute_category, axis=1)

# Show the non-null count of 'category' after imputing
kickstart["category"].info()

In [None]:
# Express every goal in USD: local-currency goal times the static exchange rate.
kickstart["goal_usd"] = kickstart["goal"].mul(kickstart["static_usd_rate"])

# Remove the columns the author lists as invalid predictors
# (identifiers, raw timestamps, pledge/backer figures, state-change fields, ...).
kickstart = kickstart.drop(
    columns=[
        'id', 'name', 'goal', 'pledged', 'deadline', 'created_at', 'launched_at', 'staff_pick',
        'usd_pledged', 'backers_count', 'static_usd_rate', 'name_len', 'blurb_len',
        'state_changed_at', 'state_changed_at_month', 'state_changed_at_day',
        'state_changed_at_yr', 'state_changed_at_hr', 'spotlight', "currency",
        "disable_communication", "state_changed_at_weekday",
    ]
)

In [None]:
# Keep only campaigns whose state is 'successful' or 'failed'.
kickstart = kickstart.loc[kickstart['state'].isin(['successful', 'failed'])]

# Per-column count of missing values, restricted to columns that have any.
missing_values = kickstart.isna().sum().loc[lambda counts: counts > 0]

# Bar chart of the missing-value counts.
plt.figure(figsize=(10, 6))
sns.barplot(x=missing_values.index, y=missing_values.values)
plt.xlabel('Columns')
plt.ylabel('Missing Values Count')
plt.title('Missing Values Count per Column')
plt.show()

In [None]:
# Author's note: only about 150 rows (~1%) still lack a value, all in the
# 'category' column, so those rows are dropped and the index renumbered.
kickstart = kickstart.dropna().reset_index(drop=True)
kickstart.info()

In [None]:
# Correlation heatmap.
# The frame still contains text columns at this point; DataFrame.corr() raises
# on those in pandas >= 2.0, so restrict it to numeric/boolean columns.
plt.figure(figsize=(16, 8))
sns.heatmap(kickstart.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
# Checking multicollinearity.
# Text columns are excluded explicitly because DataFrame.corr() raises on them
# in pandas >= 2.0.
correlation_matrix = kickstart.select_dtypes(include=['number', 'bool']).corr()

# Column pairs whose absolute correlation exceeds 0.8. Each pair is listed once
# (upper triangle only) and read with .loc instead of chained indexing.
corr_cols = list(correlation_matrix.columns)
high_correlation_pairs = [
    (first, second)
    for pos, first in enumerate(corr_cols)
    for second in corr_cols[pos + 1:]
    if abs(correlation_matrix.loc[first, second]) > 0.8
]

print("Pairs with high correlation (> 0.8):")
print(high_correlation_pairs)

In [None]:
# Drop one member of each highly correlated pair found above.
kickstart = kickstart.drop(
    columns=['deadline_yr', 'created_at_yr', 'launch_to_state_change_days']
)

In [None]:
# Re-check multicollinearity after dropping the correlated columns.
# Text columns are excluded explicitly because DataFrame.corr() raises on them
# in pandas >= 2.0.
correlation_matrix = kickstart.select_dtypes(include=['number', 'bool']).corr()

# Column pairs whose absolute correlation exceeds 0.8, each listed once.
corr_cols = list(correlation_matrix.columns)
high_correlation_pairs = [
    (first, second)
    for pos, first in enumerate(corr_cols)
    for second in corr_cols[pos + 1:]
    if abs(correlation_matrix.loc[first, second]) > 0.8
]

print("Pairs with high correlation (> 0.8):")
print(high_correlation_pairs)


#There is no colinearity now

# Feature Engineering

In [None]:
# Cross-tabulate country against campaign state: one row per country, one
# column per state, cells are project counts.
cross_tab = pd.crosstab(kickstart['country'], kickstart["state"])
print(cross_tab)

In [None]:
# Countries kept as their own level; chosen by the author from the data overview.
major_countries = ['US', 'GB', 'CA']


def categorize_country(country):
    """Return ``country`` if it is one of ``major_countries``, otherwise 'Other'."""
    return country if country in major_countries else 'Other'

# Collapse the country codes into US / GB / CA / Other ...
kickstart['country_grouped'] = kickstart['country'].map(categorize_country)

# ... and discard the original high-cardinality column.
kickstart = kickstart.drop(columns=["country"])

In [None]:
# Number of projects per category, most frequent first.
category_counts = kickstart['category'].value_counts()

# Share of each category as a percentage of all projects.
category_percentage = category_counts.div(category_counts.sum()).mul(100)

# Bar chart of the percentage distribution.
plt.figure(figsize=(26, 6))
sns.barplot(x=category_percentage.index, y=category_percentage.values)
plt.xlabel('Categories')
plt.ylabel('Percentage (%)')
plt.title('Percentage Distribution of Categories')
plt.show()

In [None]:
# Number of projects per category, most frequent first.
category_counts = kickstart['category'].value_counts()

# Running share of projects covered when categories are added in that order.
category_cumulative = category_counts.cumsum().div(category_counts.sum()).mul(100)

# Bar chart of the cumulative percentage.
plt.figure(figsize=(10, 6))
category_cumulative.plot(kind='bar')
plt.xlabel('Categories')
plt.ylabel('Cumulative Percentage (%)')
plt.title('Cumulative Percentage Distribution of Categories')
plt.show()

In [None]:
# Author's note: the 8 most frequent categories cover about 80% of the
# projects, so every other category is folded into 'Other'.

# The eight most frequent category labels.
top_8_categories = kickstart['category'].value_counts().nlargest(8).index.tolist()


def categorize_based_on_top_8(category):
    """Return ``category`` if it is one of ``top_8_categories``, otherwise 'Other'."""
    return category if category in top_8_categories else 'Other'


# Replace 'category' with its grouped version. The column is dropped and then
# re-added, so (as before) it ends up as the last column of the frame.
kickstart = kickstart.drop(columns=["category"]).assign(
    category=kickstart['category'].apply(categorize_based_on_top_8)
)

In [None]:
# One duration feature instead of two: days from creation to deadline is the
# sum of creation->launch and launch->deadline.
kickstart["create_to_deadline_days"] = kickstart["create_to_launch_days"].add(
    kickstart["launch_to_deadline_days"]
)

# The two component columns are no longer needed.
kickstart = kickstart.drop(columns=["create_to_launch_days", "launch_to_deadline_days"])

In [None]:
# One-hot encode the columns that hold categories in text format.
# drop_first=True discards the first dummy of every column (the reference
# level) to avoid perfect multicollinearity. The encoded columns are removed
# and their dummies appended in the order listed here.
kickstart = pd.get_dummies(
    kickstart,
    columns=['launched_at_weekday', 'created_at_weekday', 'deadline_weekday',
             'category', 'country_grouped', "state"],
    drop_first=True,
)

In [None]:
# Inspect column names, dtypes and non-null counts after one-hot encoding.
kickstart.info()

In [None]:
# Outlier detection is run on the int64/float64 columns only.
numerical_columns = kickstart.select_dtypes(include=[np.int64, np.float64])

# Isolation Forest that flags 5% of the rows as outliers (contamination=0.05).
isolforest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# fit_predict returns one label per row, in row order: -1 = outlier, 1 = inlier.
pred = isolforest.fit_predict(numerical_columns)
is_outlier = pred == -1
indices = np.flatnonzero(is_outlier)  # row positions of the outliers

print(len(indices)) #Number of outliers isolated

# Remove all outliers in one positional step (instead of dropping rows one by
# one, which copies the frame for every outlier and only works while the index
# labels equal the row positions), then renumber the index.
kickstart = kickstart.loc[~is_outlier].reset_index(drop=True)

# Checking Skewness

In [None]:
# Numeric (float64 / int64) part of the frame.
numerical_cols = kickstart.select_dtypes(include=['float64', 'int64'])

# Sample skewness of every numeric column, computed on its non-missing values.
skewness = pd.Series(
    {name: skew(values.dropna()) for name, values in numerical_cols.items()}
)

# Display the skewness per column
skewness

In [None]:
# Four numeric columns showed some skew (goal_usd, blurb_len_clean,
# create_to_deadline_days, launched_at_yr); each is replaced by its natural log.
# NOTE(review): np.log yields -inf for a value of 0 -- confirm none of these
# columns can contain zeros.
for raw_name, log_name in [
    ("goal_usd", "log_goal"),
    ("blurb_len_clean", "log_blurb_len_clean"),
    ("create_to_deadline_days", "log_create_to_deadline_days"),
    ("launched_at_yr", "log_launched_at_yr"),
]:
    kickstart[log_name] = np.log(kickstart[raw_name])
    kickstart = kickstart.drop(columns=[raw_name])

# Feature Selection

In [None]:
# Target: dummy column that marks successful campaigns; predictors: everything else.
y = kickstart["state_successful"]
X = kickstart.drop(["state_successful"],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seeded so the importances -- and the feature removal based on them -- are
# identical on every Restart & Run All.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Impurity-based importances, labelled with the training column names.
importances = clf.feature_importances_
feature_importances = pd.Series(importances, index=X_train.columns)

# Displaying feature importances
print(feature_importances.sort_values(ascending=False))

In [None]:
# All country dummies scored below 0.01 importance, so the whole group is removed.
kickstart = kickstart.drop(
    columns=["country_grouped_Other", "country_grouped_GB", "country_grouped_US"]
)

In [None]:
# Predictors (everything except the target) and target variable.
X = kickstart.drop(columns=["state_successful"])
y = kickstart["state_successful"]

# Calendar fields that are stored as numbers but are treated as categories.
columns_to_convert = [
    "created_at_month", "created_at_day", "created_at_hr",
    "launched_at_month", "launched_at_day", "launched_at_hr",
    "deadline_month", "deadline_day", "deadline_hr",
]

# Cast each of them to pandas 'category' dtype.
X = X.astype({name: 'category' for name in columns_to_convert})

# Random Forest

In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=42)

# Random Forest with fixed hyperparameters.
# NOTE(review): warm_start=True only matters if fit() is called again on this
# estimator; here it is fitted once.
clf = RandomForestClassifier(max_depth=20,min_samples_leaf=2,min_samples_split=3,n_estimators= 300, random_state=42,
                             warm_start=True)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Accuracy on the held-out 30%
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using Random Forest Classifier:", accuracy)

# Hyperparameter Tuning Random Forest 

In [None]:
# Parameter grid: 2 x 4 x 3 x 5 = 120 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8]
}

# Base estimator for the search.
# NOTE(review): seeded with 0 here while the other models in this notebook use 42.
rf = RandomForestClassifier(random_state=0)

# Exhaustive search scored by accuracy; cv and n_jobs are left at their defaults.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy')

# Fit on the training split only
grid_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Keep the best estimator for further predictions
best_rf = grid_search.best_estimator_
print(best_rf)

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Same 70/30 split and seed as used for the Random Forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Gradient Boosting classifier with fixed hyperparameters.
gb_clf = GradientBoostingClassifier(learning_rate=0.10,max_depth=3,n_estimators=205,random_state=42,min_samples_split=20
                                   ,min_samples_leaf = 2)

# Training the classifier
gb_clf.fit(X_train, y_train)

# Making predictions on the test split
y_pred = gb_clf.predict(X_test)

# Accuracy on the test split, printed as a percentage with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using Gradient Boosting Classifier is: {accuracy * 100:.2f}%")

# Hyperparameter Tuning Gradient Boosting Classifier

In [None]:
# Parameter grid: 1 x 6 x 2 x 4 x 7 = 336 combinations.
param_grid = {
    'learning_rate': [0.1],
    'n_estimators': [100, 205, 200, 300, 250, 400],
    'max_depth': [2, 3],
    "min_samples_leaf" : [2,3,4,6],
    "min_samples_split" : [2,4,6,8,10,15,20]
}

# 5-fold grid search scored by accuracy, using all available cores (n_jobs=-1).
gb_clf = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(estimator=gb_clf, param_grid=param_grid, n_jobs=-1, cv=5, scoring='accuracy')

# Uses the X_train / y_train left in the kernel by the preceding split.
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ",grid_search.best_score_)

# Accuracy of the best estimator on the held-out test split.
best_clf = grid_search.best_estimator_
accuracy = best_clf.score(X_test, y_test)
print(f"Test set accuracy: {accuracy}")

# Further Preprocessing for Other Models

In [None]:
# Categorize days into beginning, middle, and end of the month
def categorize_day_of_month(day):
    """Map a day number to 'Beginning' (1-10), 'Middle' (11-20) or 'End' (any other value)."""
    for label, first_day, last_day in (('Beginning', 1, 10), ('Middle', 11, 20)):
        if first_day <= day <= last_day:
            return label
    return 'End'

# Derive a day-of-month group for each of the three timestamps ...
for day_col, group_col in [
    ('launched_at_day', 'launched_at_day_group'),
    ('created_at_day', 'created_at_day_group'),
    ('deadline_day', 'deadline_day_group'),
]:
    kickstart[group_col] = kickstart[day_col].apply(categorize_day_of_month)

# ... and drop the raw day columns they replace.
kickstart = kickstart.drop(columns=['launched_at_day', 'created_at_day', 'deadline_day'])

# Categorize months into quarters
def categorize_month(month):
    """Map a month number to 'Q1' (1-3), 'Q2' (4-6), 'Q3' (7-9) or 'Q4' (any other value)."""
    for quarter, first_month, last_month in (('Q1', 1, 3), ('Q2', 4, 6), ('Q3', 7, 9)):
        if first_month <= month <= last_month:
            return quarter
    return 'Q4'

    
# Derive the quarter for each of the three timestamps ...
for month_col, quarter_col in [
    ('launched_at_month', 'launched_at_month_quarter'),
    ('created_at_month', 'created_at_month_quarter'),
    ('deadline_month', 'deadline_month_quarter'),
]:
    kickstart[quarter_col] = kickstart[month_col].apply(categorize_month)

# ... and drop the raw month columns they replace.
kickstart = kickstart.drop(columns=['launched_at_month', 'created_at_month', 'deadline_month'])

def categorize_hour(hour):
    """Map an hour to a part of the day.

    [0, 6) -> 'Early Morning', [6, 12) -> 'Morning',
    [12, 18) -> 'Afternoon/Evening', any other value -> 'Night'.
    """
    for label, start, stop in (
        ('Early Morning', 0, 6),
        ('Morning', 6, 12),
        ('Afternoon/Evening', 12, 18),
    ):
        if start <= hour < stop:
            return label
    return 'Night'


# Derive the part of day for each of the three timestamps ...
for hour_col, group_col in [
    ('launched_at_hr', 'launched_at_hour_group'),
    ('created_at_hr', 'created_at_hour_group'),
    ('deadline_hr', 'deadline_hour_group'),
]:
    kickstart[group_col] = kickstart[hour_col].apply(categorize_hour)

# ... and drop the raw hour columns they replace.
kickstart = kickstart.drop(columns=['launched_at_hr', 'created_at_hr', 'deadline_hr'])

# Dummifying

In [None]:
# Work on a copy so the tree-model frame (kickstart) stays untouched.
kickstart_df = kickstart.copy()

# Turn every remaining text column into 0/1 indicator columns named
# "<column>_<value>", one per distinct value in order of first appearance.
# The earlier check for values ending in 'others'/'weekend' never matched any
# label produced by the grouping helpers and has been removed; reference levels
# are dropped explicitly below.
for col in kickstart_df.select_dtypes(include=['object']).columns:
    for value in kickstart_df[col].unique():
        kickstart_df[f"{col}_{value}"] = (kickstart_df[col] == value).astype(int)

    # The original text column is replaced by its indicators.
    kickstart_df = kickstart_df.drop(columns=[col])

# Drop one reference level per categorical feature to avoid perfect
# multicollinearity among the dummies.
kickstart_df = kickstart_df.drop(
    columns=[
        "category_Other",
        "deadline_day_group_End", "created_at_day_group_End", "launched_at_day_group_End",
        "launched_at_month_quarter_Q4", "created_at_month_quarter_Q4", "deadline_month_quarter_Q4",
        "launched_at_hour_group_Early Morning", "created_at_hour_group_Early Morning",
        "deadline_hour_group_Early Morning",
    ]
)

# Checking Correlation

In [None]:
correlation_matrix = kickstart_df.corr()

# Column pairs whose absolute correlation is at least 0.8. Each pair is listed
# once (upper triangle only) and read with .loc instead of chained indexing.
corr_cols = list(correlation_matrix.columns)
high_correlation_pairs = [
    (first, second)
    for pos, first in enumerate(corr_cols)
    for second in corr_cols[pos + 1:]
    if abs(correlation_matrix.loc[first, second]) >= 0.8
]

print("Pairs with high correlation (>= 0.8):")
print(high_correlation_pairs)

# Output recorded by the author (previously pasted here as a bare list literal):
#   ('launched_at_hour_group_Morning', 'deadline_hour_group_Morning')

# Keep only one column of that correlated pair.
kickstart_df = kickstart_df.drop(["launched_at_hour_group_Morning"],axis=1)

# ANN

In [None]:
# Predictors and target are both taken from kickstart_df, so this cell no
# longer depends on a `y` left behind by an earlier cell.
X = kickstart_df.drop(["state_successful"],axis=1)
y = kickstart_df["state_successful"]

# Split BEFORE scaling: the scaler must learn its min/max from the training
# rows only, otherwise information about the test rows leaks into the model.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Min-max scaling fitted on the training split, then applied to both splits.
scaler = MinMaxScaler()
X_std_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_std_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# Full scaled frame (same train-fitted scaler), kept because later cells inspect X_std.
X_std = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Define the MLPClassifier: two hidden layers (32, 16), logistic activation.
mlp = MLPClassifier(alpha=0.0001,hidden_layer_sizes=(32, 16), max_iter=1000, activation='logistic', solver='adam', random_state=42)

# Train the model
mlp.fit(X_std_train, y_train)

# Make predictions
y_pred = mlp.predict(X_std_test)

# Evaluate the model
print(f"Accuracy using ANN is: {accuracy_score(y_test, y_pred) * 100:.2f}%")

# Hyperparameter Tuning for ANN

In [None]:
# Hyperparameter grid: 3 x 2 x 3 = 18 combinations.
param_grid = {
    'hidden_layer_sizes': [(64, 32), (32, 16), (128, 64)],  # Structure of hidden layers
    'solver': ['adam', 'sgd'],  # Solver for weight optimization
    'alpha': [0.0001, 0.001, 0.01],  # L2 penalty (regularization term) parameter
}
mlp = MLPClassifier(max_iter = 10000, random_state=42)

# 5-fold grid search scored by accuracy
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, scoring='accuracy', cv=5)

# Fit the grid search to the scaled training data
grid_search.fit(X_std_train, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict using the best model
# NOTE(review): y_test_pred is not used again in this cell.
y_test_pred = best_model.predict(X_std_test)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the test set (best_mlp is the same estimator as best_model)
best_mlp = grid_search.best_estimator_
test_score = best_mlp.score(X_std_test, y_test)
print(f"Test Accuracy using ANN : {test_score * 100}%")

# Logistic Regression

In [None]:
# L1-penalised logistic regression (C=100) using the liblinear solver.
log_reg = LogisticRegression(random_state=42,C=100, penalty='l1',solver='liblinear')

# Train on the scaled training split
log_reg.fit(X_std_train, y_train)

# Predict on the scaled test split
y_pred = log_reg.predict(X_std_test)

# Accuracy plus the per-class precision/recall/F1 report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy using Logistic regression is: {accuracy}")
print("Classification Report for Logistic Regression is :")
print(report)

# Hyperparameter Tuning Logistic Regression

In [None]:
# Hyperparameter grid: 5 x 2 x 2 = 20 combinations.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],       # Type of regularization
    'solver': ['liblinear', 'saga'] # Algorithm to use for optimization
}

# liblinear and saga both use randomness, so the estimator is seeded for
# reproducible results; max_iter is raised from the default to give saga room
# to converge.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_std_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ",grid_search.best_score_)

# Evaluate the best model on the test set
best_logit = grid_search.best_estimator_
test_score = best_logit.score(X_std_test, y_test)
print(f"Test Accuracy : {test_score * 100}%")

# KNN

In [None]:
# K-nearest-neighbours classifier voting over 30 neighbours.
knn = KNeighborsClassifier(n_neighbors=30)  # You can change n_neighbors as needed

# Train on the scaled training split
knn.fit(X_std_train, y_train)

# Predict on the scaled test split
y_pred = knn.predict(X_std_test)

# Accuracy on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using KNN is : {accuracy}")

In [None]:
# Inspect the columns and dtypes of the scaled predictor frame.
X_std.info()

# Clustering

In [None]:
!pip install tensorflow
import pandas as pd
import numpy as np
import kmodes
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import MinMaxScaler
from kmodes import kprototypes
from sklearn.metrics import silhouette_score
from kmodes.kprototypes import KPrototypes, matching_dissim, euclidean_dissim
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
from scipy.stats import f
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from matplotlib import pyplot
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Reload the raw workbook: the clustering part starts again from unprocessed data.
# NOTE(review): same absolute Windows path as the first load cell.
kickstart = pd.read_excel("E:/Users/pc/Downloads/Mining proj/Induvidual proj/Kickstarter.xlsx")

In [None]:
# Normalise project names: missing names become empty strings, everything is
# lower-cased (this overwrites the 'name' column).
kickstart['name'] = kickstart['name'].fillna('').str.lower()

# Fit a TF-IDF vectoriser on the names; tfidf_matrix has one row per project
# and is read by impute_category to find similarly named projects.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(kickstart['name'])

# Function to impute missing values
def impute_category(row, n_neighbors=2):
    """Return ``row['category']``, imputing a missing value from similarly named projects.

    Rows that already have a category are returned unchanged. For a row with a
    missing category, the ``n_neighbors`` projects whose names are most similar
    (TF-IDF cosine similarity) AND whose category is known vote on the result.

    Reads the notebook globals ``kickstart`` and ``tfidf_matrix``.

    Parameters
    ----------
    row : pd.Series
        One row of ``kickstart`` as passed by ``DataFrame.apply(axis=1)``.
    n_neighbors : int, default 2
        Number of most similar donor rows that take part in the vote.

    Returns
    -------
    The (possibly imputed) category, or ``np.nan`` when no project with a known
    category shares any term with this row's name.
    """
    if not pd.isna(row['category']):
        return row['category']

    # Similarity between this project's name and every project name.
    # assumes kickstart still has its default 0..n-1 index, so row.name is a
    # valid row position in tfidf_matrix -- TODO confirm if the frame is re-indexed.
    similarities = cosine_similarity(tfidf_matrix, tfidf_matrix[row.name]).ravel()

    categories = kickstart['category']
    # Donors must have a known category (this also excludes the row itself,
    # whose category is missing) and share at least one term with the name;
    # a zero-similarity "neighbour" carries no information.
    donors = np.flatnonzero(categories.notna().to_numpy() & (similarities > 0))
    if donors.size == 0:
        return np.nan

    # Most similar donors first; the stable sort keeps ties in row order.
    ranked = donors[np.argsort(-similarities[donors], kind='stable')][:n_neighbors]
    neighbour_categories = categories.iloc[ranked]

    # Majority vote; a tie goes to the category of the most similar neighbour
    # rather than to whichever label sorts first alphabetically.
    votes = neighbour_categories.value_counts()
    winners = set(votes.index[votes == votes.max()])
    for candidate in neighbour_categories:
        if candidate in winners:
            return candidate
    return np.nan

# Apply imputation row by row; the 'category' column is replaced only after
# every row has been processed, so impute_category sees the original values.
kickstart['category'] = kickstart.apply(impute_category, axis=1)

In [None]:
# Express every goal in USD: local-currency goal times the static exchange rate.
kickstart["goal_usd"] = kickstart["goal"].mul(kickstart["static_usd_rate"])

# Remove the columns the author lists as invalid predictors for clustering
# (identifiers, raw timestamps, pledge amounts, state-change fields).
kickstart = kickstart.drop(
    columns=[
        'id', 'name', 'goal', 'pledged', 'deadline', 'created_at', 'launched_at',
        'usd_pledged', 'static_usd_rate', 'name_len', 'blurb_len',
        'state_changed_at', 'state_changed_at_month', 'state_changed_at_day',
        'state_changed_at_yr', 'state_changed_at_hr',
    ]
)

In [None]:
# Keep only successful/failed campaigns, drop rows with any missing value and
# renumber the index.
kickstart = (
    kickstart[kickstart['state'].isin(['successful', 'failed'])]
    .dropna()
    .reset_index(drop=True)
)

# Feature engineering: days from creation to deadline.
kickstart["create_to_deadline_days"] = kickstart["create_to_launch_days"].add(
    kickstart["launch_to_deadline_days"]
)

# Dropping the remaining invalid predictors in one call.
kickstart = kickstart.drop(
    columns=['launch_to_state_change_days', "state_changed_at_weekday",
             "currency", "disable_communication"]
)

In [None]:
# Outlier detection is run on the int64/float64 columns only.
numerical_columns = kickstart.select_dtypes(include=[np.int64, np.float64])

# Isolation Forest that flags 5% of the rows as outliers (contamination=0.05).
isolforest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# fit_predict returns one label per row, in row order: -1 = outlier, 1 = inlier.
pred = isolforest.fit_predict(numerical_columns)
is_outlier = pred == -1
indices = np.flatnonzero(is_outlier)  # row positions of the outliers

print(len(indices)) #Number of outliers isolated

# Remove all outliers in one positional step (instead of dropping rows one by
# one, which copies the frame for every outlier and only works while the index
# labels equal the row positions), then renumber the index.
kickstart = kickstart.loc[~is_outlier].reset_index(drop=True)
kickstart.info()

In [None]:
# Split the frame into a categorical part and a numerical part.
numeric_feature_names = [
    "name_len_clean", "blurb_len_clean", "goal_usd", "create_to_deadline_days",
    "create_to_launch_days", "launch_to_deadline_days", "backers_count",
]
# Everything that is not one of the seven numeric features counts as categorical ...
categorical = kickstart.drop(columns=numeric_feature_names)
# ... and the numerical part is what remains once those columns are removed.
numerical = kickstart.drop(columns=categorical.columns)

In [None]:
# Min-max scale the numerical features and keep their column names.
scaler = MinMaxScaler()
numerical_std = pd.DataFrame(scaler.fit_transform(numerical), columns=numerical.columns)

# Join the scaled numerical columns back onto the categorical ones by row index;
# the numerical columns come first in the merged frame.
X_std = numerical_std.merge(categorical, left_index=True, right_index=True)

In [None]:
# Viewing the merged dataframe (column order matters: the clustering cells
# refer to the categorical columns by position).
X_std.info()

In [None]:
#!pip install kmodes
# Columns treated as categorical by K-Prototypes (calendar fields, flags, labels).
cat_num_cols = [
    "created_at_month", "created_at_day", "created_at_hr", "created_at_yr", "created_at_weekday",
    "launched_at_month", "launched_at_day", "launched_at_yr", "launched_at_weekday",
    "launched_at_hr", "deadline_month", "deadline_day", "deadline_hr", "deadline_weekday",
    "deadline_yr", "state", "spotlight", "staff_pick", "category", "country",
]

# Cast all of them to 'category' dtype in a single call.
X_std = X_std.astype({name: 'category' for name in cat_num_cols})


In [None]:
# Running K-Prototypes with 4 clusters; columns at positions 7..26 are the
# categorical ones (the seven scaled numeric columns come first).
kmixed = KPrototypes(n_clusters=4,random_state=50)
cluster = kmixed.fit_predict(X_std, categorical=list(range(7, 27)))

# Print every centroid in full, without pandas truncating rows or columns.
centroids = pd.DataFrame(kmixed.cluster_centroids_, columns=X_std.columns)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (centroids)

In [None]:
# Collect the K-Prototypes cost (kproto.cost_) for k = 2..6; the author uses it
# as a combined measure of cluster cohesion and separation.
K = range(2,7)
costs = []
for num_clusters in K:
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao', n_init=5,random_state=50)
    # Positions 7..26 hold the categorical columns.
    clusters = kproto.fit_predict(X_std, categorical=list(range(7, 27)))
    costs.append(kproto.cost_)

In [None]:
# Elbow plot: K-Prototypes cost against the number of clusters.
plt.plot(K, costs, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k in K-Prototypes')
plt.show()

In [None]:
# Describe each cluster of the 4-cluster K-Prototypes model.
# The labels must be `cluster` (returned by kmixed.fit_predict); the previous
# code indexed with `clusters`, which holds the labels of the LAST model of the
# elbow loop (k=6) and therefore did not match the clusters being described.
cluster_ids = np.unique(cluster)
cluster_descriptions = [
    # include='all' also reports statistics for the categorical columns
    X_std[cluster == cluster_id].describe(include='all')
    for cluster_id in cluster_ids
]

# Display the descriptive statistics for each cluster
print("\nCluster Descriptions:")
for cluster_id, desc in zip(cluster_ids, cluster_descriptions):
    print(f"\nCluster {cluster_id}:")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(desc)


In [None]:
# Function to calculate the mixed distance
def mixed_distance(a, b, categorical=None, alpha=0.01):
    """Dissimilarity between two records with mixed categorical/numeric fields.

    Parameters
    ----------
    a, b : indexable records of equal length.
    categorical : list of int positions holding categorical fields, or None.
    alpha : weight applied to the numeric part.

    Returns
    -------
    With ``categorical=None``: ``kprototypes.euclidean_dissim(a, b)``.
    Otherwise: ``matching_dissim`` over the categorical positions plus
    ``alpha`` times ``euclidean_dissim`` over the remaining positions.
    """
    if categorical is None:
        return kprototypes.euclidean_dissim(a, b)

    def _split(record):
        # Separate one record into (categorical, numeric) row vectors of shape (1, k).
        cat_part = np.array([record[pos] for pos in categorical]).reshape(1, -1)
        num_part = np.array(
            [record[pos] for pos in range(len(record)) if pos not in categorical]
        ).reshape(1, -1)
        return cat_part, num_part

    a_cat, a_num = _split(a)
    b_cat, b_num = _split(b)

    cat_score = kprototypes.matching_dissim(a_cat, b_cat)
    num_score = kprototypes.euclidean_dissim(a_num, b_num)
    return cat_score + num_score * alpha

# Function to compute the distance matrix
def dm_prototypes(dataset, categorical=None, alpha=0.1):
    """Pairwise ``mixed_distance`` matrix for all rows of ``dataset``.

    Parameters
    ----------
    dataset : pd.DataFrame or 2-D array-like; a DataFrame is converted via ``.values``.
    categorical : list of int column positions passed on to ``mixed_distance``.
    alpha : numeric-part weight passed on to ``mixed_distance``.

    Returns
    -------
    np.ndarray of shape (n_rows, n_rows).
    """
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.values
    n_rows = len(dataset)
    distance_matrix = np.zeros((n_rows, n_rows))
    for i in range(n_rows):
        # Both components of mixed_distance (mismatch count, squared difference)
        # are symmetric in their arguments, so each pair is computed once and
        # mirrored -- roughly half the calls of the full double loop.
        for j in range(i, n_rows):
            dist = mixed_distance(dataset[i], dataset[j], categorical=categorical, alpha=alpha)
            # mixed_distance may hand back a 1-element array; store a plain scalar.
            dist = np.asarray(dist, dtype=float).ravel()[0]
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist
    return distance_matrix

# Positions of the categorical columns in X_std (7..26).
categorical_columns = [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]

# Pairwise mixed-distance matrix for the whole dataset.
# NOTE(review): dm_prototypes loops over every pair of rows in Python, so this
# is expensive for a large frame.
distance_matrix = dm_prototypes(X_std, categorical=categorical_columns, alpha=0.1)

# Range of potential clusters
cluster_range = range(2, 7)  # for example, from 2 to 6 clusters

# Silhouette scores list
silhouette_scores = []

# Silhouette score for each number of clusters, using the precomputed distances.
# NOTE: this loop rebinds `kmixed` and `cluster_labels`; after it finishes both
# refer to the last model fitted (n_clusters=6).
for n_clusters in cluster_range:
    kmixed = KPrototypes(n_clusters=n_clusters, random_state=50)
    cluster_labels = kmixed.fit_predict(X_std, categorical=categorical_columns)
    silhouette_avg = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
    silhouette_scores.append(silhouette_avg)

# Plotting the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.title('Silhouette Score vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

In [None]:
# Calinski-Harabasz pseudo-F for the most recent K-Prototypes labelling.
n_obs = len(numerical)
k_found = len(np.unique(cluster_labels))
score = calinski_harabasz_score(numerical, cluster_labels) #calculates f-score

# p-value from the F distribution. The degrees of freedom are derived from the
# data (k-1 and n-k) instead of being hard-coded.
df1 = k_found - 1
df2 = n_obs - k_found
pvalue = f.sf(score, df1, df2)  # survival function = 1 - cdf, without cancellation
#getting very small p-value

# Finding optimal K with K-Means on the numerical features.
# NOTE(review): `numerical` is the unscaled frame; confirm that is intended
# rather than `numerical_std`.
for i in range (2,7):
    df1 = i - 1
    df2 = n_obs - i
    kmeans = KMeans(n_clusters=i, n_init="auto", random_state=50)
    model = kmeans.fit(numerical)
    labels = model.labels_
    # Score THIS K-Means labelling; the previous code scored `cluster_labels`
    # and so printed the same value for every k.
    score = calinski_harabasz_score(numerical, labels)
    print(i,'F-score:',score)
    print(i,'p-value:',f.sf(score, df1, df2))

# Preprocessing for Other Clustering Algorithms

In [None]:
# Countries kept as their own level; chosen by the author from the data overview.
major_countries = ['US', 'GB', 'CA']


def categorize_country(country):
    """Return ``country`` if it is one of ``major_countries``, otherwise 'Other'."""
    return country if country in major_countries else 'Other'
    

# Collapse the country codes into US / GB / CA / Other ...
kickstart['country_grouped'] = kickstart['country'].map(categorize_country)

# ... and discard the original country column.
kickstart = kickstart.drop(columns=["country"])

In [None]:
# Author's note: the 8 most frequent categories cover about 80% of the distribution,
# so every remaining category is folded into a single 'Other' level.

# Names of the 8 most frequent values in the 'category' column
category_counts = kickstart['category'].value_counts()
top_8_categories = category_counts.nlargest(8).index.tolist()


def categorize_based_on_top_8(category):
    """Return `category` unchanged if it is in `top_8_categories`, otherwise 'Other'."""
    return category if category in top_8_categories else 'Other'

# Map every category onto its top-8 level (or 'Other') in a temporary column
kickstart['category_grouped_top_8'] = kickstart['category'].apply(categorize_based_on_top_8)

# Preview original vs grouped values. A bare expression is only rendered when it is
# the last statement of a cell, so the preview is displayed explicitly.
IPython.display.display(kickstart[['category', 'category_grouped_top_8']].head())

# Replace the original column with the grouped one: drop the raw column and give the
# grouped column its name. The renamed column stays last in the frame.
kickstart = (
    kickstart
    .drop(columns=["category"])
    .rename(columns={"category_grouped_top_8": "category"})
)

In [None]:
# Categorize days into beginning, middle, and end of the month
def categorize_day_of_month(day):
    """Bucket a day-of-month number into 'Beginning', 'Middle' or 'End'.

    Values in 1-10 give 'Beginning' and values in 11-20 give 'Middle'.
    Anything that matches neither range falls through to 'End'.
    """
    for low, high, label in ((1, 10, 'Beginning'), (11, 20, 'Middle')):
        if low <= day <= high:
            return label
    return 'End'

# Add the part-of-month bucket for each date column, then discard the raw day numbers
kickstart = kickstart.assign(
    launched_at_day_group=kickstart['launched_at_day'].apply(categorize_day_of_month),
    created_at_day_group=kickstart['created_at_day'].apply(categorize_day_of_month),
    deadline_day_group=kickstart['deadline_day'].apply(categorize_day_of_month),
).drop(columns=['launched_at_day', 'created_at_day', 'deadline_day'])

# Categorize months into quarters
def categorize_month(month):
    """Bucket a month number into 'Q1', 'Q2', 'Q3' or 'Q4'.

    Values in 1-3 give 'Q1', 4-6 give 'Q2' and 7-9 give 'Q3'.
    Anything that matches none of these ranges falls through to 'Q4'.
    """
    for low, high, label in ((1, 3, 'Q1'), (4, 6, 'Q2'), (7, 9, 'Q3')):
        if low <= month <= high:
            return label
    return 'Q4'

    
# Add the quarter for each date column, then discard the raw month numbers
kickstart = kickstart.assign(
    launched_at_month_quarter=kickstart['launched_at_month'].apply(categorize_month),
    created_at_month_quarter=kickstart['created_at_month'].apply(categorize_month),
    deadline_month_quarter=kickstart['deadline_month'].apply(categorize_month),
).drop(columns=['launched_at_month', 'created_at_month', 'deadline_month'])



def categorize_hour(hour):
    """Bucket an hour-of-day number into a named part of the day.

    [0, 6) gives 'Early Morning', [6, 12) gives 'Morning' and [12, 18) gives
    'Afternoon/Evening'. Anything outside these ranges falls through to 'Night'.
    """
    day_parts = (
        (0, 6, 'Early Morning'),
        (6, 12, 'Morning'),
        (12, 18, 'Afternoon/Evening'),
    )
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    return 'Night'


# Add the part-of-day bucket for each timestamp column, then discard the raw hours
kickstart = kickstart.assign(
    launched_at_hour_group=kickstart['launched_at_hr'].apply(categorize_hour),
    created_at_hour_group=kickstart['created_at_hr'].apply(categorize_hour),
    deadline_hour_group=kickstart['deadline_hr'].apply(categorize_hour),
).drop(columns=['launched_at_hr', 'created_at_hr', 'deadline_hr'])


# Create a copy of the kickstart dataframe so the encoding below does not touch it
kickstart_df = kickstart.copy()

# Levels whose name ends with one of these suffixes get no dummy column
# (they act as the reference level of their variable).
# NOTE(review): the check is case-sensitive, so the 'Other' level produced by the
# country/category grouping does not match 'others' and still gets a dummy column --
# confirm whether that is intended.
baseline_suffixes = ('others', 'weekend')

# Replace every object-typed column with one 0/1 indicator column per level.
# The column list is taken once, before any indicator columns are added.
for col in kickstart_df.select_dtypes(include=['object']).columns:

    for value in kickstart_df[col].unique():

        # Missing values have no level of their own; such rows are 0 in every
        # indicator of this variable (calling .endswith on NaN would raise).
        if pd.isna(value):
            continue

        # str() keeps the suffix test safe for non-string levels
        if str(value).endswith(baseline_suffixes):
            continue

        kickstart_df[f"{col}_{value}"] = (kickstart_df[col] == value).astype(int)

    # Drop the original column once its indicators exist
    kickstart_df = kickstart_df.drop(columns=col)

In [None]:
# Min-max scale every feature, keeping the original column names
X = kickstart_df
scaler = MinMaxScaler()
X_std = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# K-means

In [None]:
# Rebuild the feature frame restricted to the feature columns of X
X_std = pd.DataFrame(X_std, columns=X.columns)
kmeans = KMeans(n_clusters=4, random_state=50)  # 4 clusters
clusters = kmeans.fit_predict(X_std)

# Attach the labels to a separate copy used only for profiling. Writing them into
# X_std would turn the cluster id into a feature for every later cell that scores
# or re-clusters X_std.
cluster_profile = X_std.assign(cluster=clusters)

# Mean value of each feature within each cluster
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(cluster_profile.groupby('cluster').mean())

# Basic visualization (only drawn when there are exactly 2 features)
if X_std.shape[1] == 2:
    # X_std is a DataFrame, so positional access goes through .iloc
    plt.scatter(X_std.iloc[:, 0], X_std.iloc[:, 1], c=clusters, cmap='viridis')
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
    plt.title("KMeans Clustering")
    plt.show()

In [None]:
# Silhouette score of the 4-cluster K-means labels
score = silhouette_score(X=X_std, labels=clusters)
print("Silhouette Score: ", score)

In [None]:
# Calinski-Harabasz index (pseudo F-statistic) of the K-means labels
score = calinski_harabasz_score(X=X_std, labels=clusters)
score

# Hierarchical Clustering

In [None]:
# Agglomerative (hierarchical) clustering into 4 groups: complete linkage, Euclidean metric
cluster = AgglomerativeClustering(n_clusters=4, linkage="complete", metric="euclidean")
ams = cluster.fit_predict(X_std)

# Linkage matrix from scipy, needed to draw the dendrogram
linked = linkage(X_std, method='complete')

# Dendrogram whose leaves are labelled with the cluster id assigned above
plt.figure(figsize=(16, 7))
dendrogram(
    linked,
    orientation='top',
    labels=ams,
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.title('Hierarchical Clustering Dendrogram (Complete Linkage)')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

In [None]:
# Silhouette score of the hierarchical clustering labels
silhouette_avg = silhouette_score(X=X_std, labels=ams)
print("Silhouette Score: ", silhouette_avg)

In [None]:
# Calinski-Harabasz index (pseudo F-statistic) of the hierarchical clustering labels
score = calinski_harabasz_score(X=X_std, labels=ams)
score

In [None]:
# Project the scaled features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pd.DataFrame(pca.fit_transform(X_std), columns=['PC1', 'PC2'])

# Visualizing components. Uses the `plt` alias imported at the top of the notebook,
# like every other plotting cell, instead of a separate `pyplot` name.
plt.scatter(X_pca['PC1'], X_pca['PC2'])
plt.title("First two principal components")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()

# DBSCAN

In [None]:
# DBSCAN clustering; eps and min_samples are the parameters to tune
dbscan = DBSCAN(eps=2.5, min_samples=30)
clusters = dbscan.fit(X_std).labels_


In [None]:
# The silhouette score is only computed when more than one distinct label is present;
# otherwise 0 is reported.
if len(set(clusters)) > 1:
    dbscan_silhouette = silhouette_score(X_std, clusters)
else:
    dbscan_silhouette = 0
print(dbscan_silhouette)

In [None]:
# Calinski-Harabasz index (pseudo F-statistic) of the DBSCAN labels.
# calinski_harabasz_score raises ValueError when only one distinct label is present,
# which happens if DBSCAN puts every point in a single group. Fall back to 0 in that
# case, mirroring the guard used for the DBSCAN silhouette score.
score = calinski_harabasz_score(X_std, clusters) if len(set(clusters)) > 1 else 0
score

# Auto Encoders

In [None]:
import tensorflow as tf
from tensorflow.keras.backend import floatx  # NOTE(review): not used in this cell
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

# Seed TensorFlow's global random generator so repeated runs are comparable
tf.random.set_seed(42)

# Convert the features to a float32 NumPy array for the network
X_std = np.asarray(X_std).astype(np.float32)

# Split the dataset into training and test sets
X_std_train, X_std_test = train_test_split(X_std, test_size=0.2, random_state=42)

# Size of the encoded (bottleneck) representation -- adjust as needed
encoding_dim = 32
n_features = X_std_train.shape[1]

# Single-hidden-layer autoencoder: input -> encoded -> reconstruction of the input
input_layer = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(n_features, activation='relu')(encoded)

# Build and compile the autoencoder; it is trained to reproduce its own input
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the autoencoder, using the test split for validation loss
autoencoder.fit(X_std_train, X_std_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_std_test, X_std_test))

# The encoder shares the trained layers and outputs the bottleneck representation
encoder = Model(inputs=input_layer, outputs=encoded)

# Encode the test data
# NOTE(review): only the 20% test split is encoded and clustered below, whereas the
# other clustering cells use all rows of X_std -- confirm this is intended.
encoded_data = encoder.predict(X_std_test)

# Apply K-Means clustering on the encoded data
kmeans = KMeans(n_clusters=4, random_state=50)
clusters = kmeans.fit_predict(encoded_data)

# Calculate Silhouette Score in the encoded space
score = silhouette_score(encoded_data, clusters)
print('Silhouette Score:', score)

In [None]:
# Visualizing the clusters found on the autoencoder representation

# Reduce the encoded data to 2 principal components so it can be drawn
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(encoded_data)

# Scatter of the two components, coloured by cluster label
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', marker='o')
fig.colorbar(points, ax=ax)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('Cluster Visualization')
plt.show()