In [None]:
from fastai.imports import *
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine, text
import os
# import fastbook
# fastbook.setup_book()
# from fastbook import *
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split # Ensure DataLoader is from torch.utils.data
import torch # Keep this for general tensor operations
# import torch.optim as optim # ADD THIS
from torch.utils.data import TensorDataset, random_split
from sklearn.preprocessing import StandardScaler
np.set_printoptions(linewidth=130)


In [None]:
# Connection settings are read from the environment; the second argument to
# os.getenv is the fallback used when the variable is not set.
DB_USER = os.getenv('PG_USER', 'admin')
# NOTE(review): the fallback password is a literal stored in the notebook;
# consider requiring PG_PASSWORD to be set instead of shipping a default.
DB_PASSWORD = os.getenv('PG_PASSWORD', 'admin')
DB_HOST = os.getenv('PG_HOST', 'localhost')
DB_PORT = os.getenv('PG_PORT', '5432')
DB_NAME = os.getenv('PG_DB_NAME', 'SYAS')
# Placeholder value; no other cell visible in this notebook references it.
TABLE_NAME = 'your_table_name'

def get_dataframe_from_postgres(table_name: str) -> pd.DataFrame:
    """Load every row of one PostgreSQL table into a DataFrame.

    The connection is built from the module-level DB_USER, DB_PASSWORD,
    DB_HOST, DB_PORT and DB_NAME settings.

    Args:
        table_name: Plain table identifier, optionally schema-qualified
            (``schema.table``). Letters, digits and underscores only.

    Returns:
        The table contents, or an empty DataFrame when connecting or querying
        fails (the error is printed, not raised).

    Raises:
        ValueError: If ``table_name`` is not a plain identifier.
    """
    import re
    from urllib.parse import quote_plus

    # Identifiers cannot be sent as bound parameters, so the name ends up in
    # the SQL text itself. Reject anything that is not a simple identifier
    # before it can reach the query.
    if not isinstance(table_name, str) or not re.fullmatch(
        r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?", table_name
    ):
        raise ValueError(f"Invalid table name: {table_name!r}")

    # URL-encode the credentials so characters such as '@', ':' or '/' in a
    # user name or password do not corrupt the connection URL.
    db_connection_str = (
        f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
        f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    )
    engine = None
    df = pd.DataFrame()
    try:
        engine = create_engine(db_connection_str)
        sql_query = text(f"SELECT * FROM {table_name}")
        df = pd.read_sql(sql_query, engine)
    except ImportError:
        print("Error: psycopg2-binary is not installed. Please install it using 'pip install psycopg2-binary'")
    except Exception as e:
        # Best-effort by design: report the problem and hand back the empty frame.
        print(f"An error occurred: {e}")
    finally:
        if engine is not None:
            engine.dispose()
    return df

In [None]:
df_members = get_dataframe_from_postgres("members")
df_matches = get_dataframe_from_postgres("matches")

In [None]:
df_members.head()

In [None]:
# Drop height_min, height_max and height in a single call. errors='ignore'
# keeps the cell re-runnable: a second execution finds the columns already
# gone instead of raising KeyError.
df_members.drop(columns=['height_min', 'height_max', 'height'], inplace=True, errors='ignore')

In [None]:
# The same seven labels are used for both genders; only the cut points differ.
height_choices = [
    'Very Short',
    'Short',
    'Below Average Height',
    'Average Height',
    'Above Average Height',
    'Tall',
    'Very Tall'
]


def _height_conditions(df, gender, edges, tall_max):
    """Build the seven mutually exclusive boolean masks for one gender.

    ``edges`` are the lower bounds (in inches) of the five middle-to-upper
    buckets; consecutive buckets are half-open ``[lo, hi)`` so fractional
    heights such as 64.5 cannot fall between two buckets. 'Tall' runs from
    ``edges[-1]`` up to and including ``tall_max``; anything above is
    'Very Tall'. For whole-inch heights this yields the same labels as
    closed integer ranges.
    """
    is_gender = df['gender'] == gender
    height = df['height_inches']
    conditions = [is_gender & (height < edges[0])]
    conditions += [
        is_gender & (height >= lo) & (height < hi)
        for lo, hi in zip(edges, edges[1:])
    ]
    conditions.append(is_gender & (height >= edges[-1]) & (height <= tall_max))
    conditions.append(is_gender & (height > tall_max))
    return conditions


men_conditions = _height_conditions(df_members, 'Male', [62, 65, 67, 70, 72], 74)
men_choices = list(height_choices)

# Define conditions and choices for women
# These categories are designed to be proportional to men's, adjusted for typical female heights.
women_conditions = _height_conditions(df_members, 'Female', [55, 58, 61, 64, 66], 68)
women_choices = list(height_choices)

# Combine all conditions and choices
all_conditions = men_conditions + women_conditions
all_choices = men_choices + women_choices

# Create the new 'height_category' column using numpy.select.
# Rows with another gender value or a missing height match no mask and get 'Unknown'.
df_members['height_category'] = np.select(all_conditions, all_choices, default='Unknown')

In [None]:
df_members[['id', 'acceptance_rate']]

In [None]:
df_members[df_members['acceptance_rate'].notna()]

In [None]:
df_members[['id','acceptance_rate']][df_members.acceptance_rate.notna()]

In [None]:
import seaborn as sns

dep = 'acceptance_rate'
fig,axs = plt.subplots(1,2, figsize=(11,5))
sns.barplot(data=df_members, y=dep, x="gender", ax=axs[0]).set(title="Acceptance Rate") #avg acceptance rate
sns.countplot(data=df_members, x="gender", ax=axs[1]).set(title="Histogram"); #gender count

In [None]:
# Columns handled as categorical: proc_cats turns them into pandas Categorical
# and the split cells then replace them with their integer category codes.
# A later cell re-declares this same list and `conts`; keep the two in sync.
# NOTE(review): 'id' is listed as a feature; if it is a per-member key it
# carries no generalisable signal — confirm whether it belongs here.
# NOTE(review): 'family_relgious_background' is spelled differently from
# 'family_religious_background' earlier in this list — presumably the real
# column name; verify against the table schema.
cats = ['id', 'country', 'city', 'state', 'gender', 'religious_orientation', 'ethnicity', 'baal_teshuva', 'cohen', 'female_convert', 'parents_convert', 'mother_maternal_grandmother_jewish', 'family_religious_background', 'describe_family_religious_background', 'female_hc', 'kosher', 'female_dress', 'male_hc', 'frequency_of_tefilah', 'male_shul_attendance', 'torah_study', 'watching_tv', 'going_out_to_movies', 'watching_movies_at_home', 'secular_education', 'emphasis_of_studies', 'jewish_education', 'study_in_israel', 'profession', 'job_description', 'eye_color', 'hair_color', 'body_type', 'mental_physical_disability', 'my_marriage_status', 'want_additional_children', 'can_marry_cohen', 'political_orientation', 'smoking_habits', 'how_active_are_you', 'plan_to_aliya', 'willing_to_relocate', 'pet_person', 'pet_i_own', 'additional_pet_i_own', 'native_language', 'languages_spoken', 'desired_marital_status', 'minimum_education_level', 'acceptable_for_match_to_have_children', 'acceptable_religious_orientation', 'acceptable_smoking_habits', 'ok_dating_someone_with_disability', 'acceptable_aliyah_responses', 'acceptable_kosher_observance', 'ok_dating_baal_teshuva', 'family_relgious_background', 'desired_torah_study', 'desired_female_hc', 'desired_female_dress', 'jewish_education_preference', 'body_type_preference', 'preference_regarding_ethnicity', 'preference_cultural_background', 'my_personality_traits', 'my_personality_go_out_to', 'favorite_music', 'physical_activities_interests', 'my_favorite_pastimes', 'looking_for_in_a_person', 'short_description_of_yourself', 'community_work', 'introvert_extravert', 'sensor_intuitive', 'thinker_feeler', 'judger_perceiver', 'approved', 'dating_status', 'colleges_universities', 'parents_convert_before_birth', 'elementary_school', 'location_i_grew_up', 'name_secondary_school', 'name_study_one_year', 'parent_location', 'complete_incomplete', 'photo', 'site', 'profile_last_modified_date', 'updated', 'acceptable_places_to_live', 
'height_category', 'how_long_single']

# Columns passed to the models as numeric features.
# NOTE(review): how 'num_matches' relates to the target is not visible here;
# if acceptance_rate is derived from matches this may leak the target — confirm.
conts = ['age', 'years_orthodox_baal_teshuva', 'times_divorced', 'how_many_children', 'number_live_with_you', 'age_of_youngest', 'number_of_siblings', 'age_min', 'age_max', 'min_height_inches', 'max_height_inches', 'height_inches', 'num_matches']
# Name of the target column predicted by every model below.
dep = 'acceptance_rate'

In [None]:
df_members.dropna(subset=['acceptance_rate'], inplace=True)

In [None]:
df_members[cats].dtypes

In [None]:
df_members.isna().sum()

In [None]:
import numpy as np
df_members.replace(r'^\s*$', np.nan, regex=True, inplace=True) # Replaces empty strings and strings with only whitespace
# Now df.isna().sum() should reflect these changes

In [None]:
df_members.isna().sum()

In [None]:
modes = df_members.mode().iloc[0]
modes

In [None]:
df_members.fillna(modes, inplace=True)

In [None]:
df_members.isna().sum()
df_members.dtypes

In [None]:
df_members['gender'].hist()

In [None]:
def proc_cats(df):
    """Convert, in place, each column of ``df`` named in the global ``cats``
    list to ``pd.Categorical``.

    Names in ``cats`` that are not columns of ``df`` are skipped. Returns None.
    """
    for name in cats:
        if name not in df.columns:
            continue
        df[name] = pd.Categorical(df[name])
proc_cats(df_members)

In [None]:
df_members.gender.head()

In [None]:
df_members.gender.cat.codes.head()

In [None]:
from numpy import random
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator; later cells draw from it via random.choice.
random.seed(42)
# An explicit random_state pins the split itself rather than relying on the
# global generator having just been seeded. The copies give each half its own
# frame so the column assignments below do not write into a slice.
trn_df,val_df = train_test_split(df_members, test_size=0.25, random_state=42)
trn_df,val_df = trn_df.copy(),val_df.copy()
# Replace every categorical column by its integer category code.
trn_df[cats] = trn_df[cats].apply(lambda x: x.cat.codes)
val_df[cats] = val_df[cats].apply(lambda x: x.cat.codes)

trn_df.dtypes

In [None]:
val_df.dtypes

In [None]:
# df_members = df_members['number_live_with_you'].replace('N/A', '0')
val_df['number_live_with_you'] = val_df['number_live_with_you'].replace('N/A', '0')
val_df['number_live_with_you'].hist()

In [None]:
trn_df['number_live_with_you'] = trn_df['number_live_with_you'].replace('N/A', '0')
trn_df['number_live_with_you'].hist()

In [None]:
trn_df['times_divorced'].value_counts(), val_df['times_divorced'].value_counts()

In [None]:
trn_df['times_divorced'] = trn_df['times_divorced'].replace('>=1', '1')
val_df['times_divorced'] = val_df['times_divorced'].replace('>=1', '1')

In [None]:
trn_df['times_divorced'].value_counts(), val_df['times_divorced'].value_counts()


In [None]:
def remove_objects(df):
    """Return a copy of ``df`` with every object-dtype column made numeric.

    For each object column, blank / whitespace-only strings become missing,
    the rest is parsed with ``pd.to_numeric(errors='coerce')`` (unparseable
    values become missing too), and then:

    * whole-number columns with missing values are stored as nullable ``Int64``;
    * whole-number columns without missing values are stored as ``int``;
    * columns holding any fractional value stay floating point, so nothing is
      truncated and the integer cast cannot fail.

    The input frame is left untouched; callers use the returned frame.
    """
    df = df.copy()
    object_cols = df.select_dtypes(include='object').columns

    for col in object_cols:
        cleaned = df[col].replace(r'^\s*$', np.nan, regex=True)
        numeric = pd.to_numeric(cleaned, errors='coerce')
        present = numeric.dropna()
        if not (present % 1 == 0).all():
            # Fractional (or infinite) values: an integer dtype would lose data.
            df[col] = numeric
        elif numeric.isnull().any():
            df[col] = numeric.astype(pd.Int64Dtype())
        else:
            df[col] = numeric.astype(int)
    return df

In [None]:
trn_df = remove_objects(trn_df)
val_df = remove_objects(val_df)

In [None]:
trn_df['times_divorced'].value_counts(), val_df['times_divorced'].value_counts()

In [None]:
def xs_y(df, cats, conts):
    """Split ``df`` into a feature frame and a target.

    Returns a tuple ``(xs, y)``: ``xs`` is a copy of the ``cats + conts``
    columns; ``y`` is the column named by the global ``dep`` when ``df`` has
    one, otherwise ``None``.
    """
    feature_cols = cats + conts
    xs = df[feature_cols].copy()
    if dep in df:
        target = df[dep]
    else:
        target = None
    return xs, target



In [None]:
trn_xs,trn_y = xs_y(trn_df, cats, conts)
val_xs,val_y = xs_y(val_df, cats, conts)

val_xs.head()

In [None]:
df_members.acceptance_rate[df_members.acceptance_rate > 0].plot(kind='kde')

In [None]:
df_members.acceptance_rate.quantile([0.25, 0.5, 0.75])

In [None]:
df_members.acceptance_rate.hist(density=True, bins=20)

In [None]:
df_members[df_members['gender'] == 'Male'].acceptance_rate.hist(density=True, bins=20)

In [None]:
df_members[df_members['gender'] == 'Male'].acceptance_rate.quantile([0.25, 0.5, 0.75])

In [None]:
df_members[df_members['gender'] == 'Female'].acceptance_rate.hist(density=True, bins=20)

In [None]:
df_members[df_members['gender'] == 'Female'].acceptance_rate.quantile([0.25, 0.5, 0.75])

In [None]:
df_members[df_members['body_type'] == 'Lean/Slender'].acceptance_rate.hist(density=True, bins=20)

In [None]:
df_members[df_members['body_type'] == 'Lean/Slender'].acceptance_rate.quantile([0.25, 0.5, 0.75])

In [None]:
df_members[df_members.height_inches >= 72 ].acceptance_rate.hist(density=True, bins=20)

In [None]:
df_members.groupby('religious_orientation', observed=False)['acceptance_rate'].quantile([.5])


In [None]:
df_members['body_type']

In [None]:
df_members.religious_orientation.head()

In [None]:
df_members.religious_orientation.cat.codes.head()

In [None]:
df_members.body_type.cat.codes.head()

In [None]:
avg_acceptance_rate = df_members.acceptance_rate.mean()
avg_acceptance_rate
avg_acceptance_rate_by_body_type = df_members.groupby('body_type')['acceptance_rate'].mean()
avg_acceptance_rate_by_body_type

In [None]:
# Baseline: predict the training-set mean for every validation row. Taking the
# mean from trn_y keeps the validation targets out of the prediction.
preds = np.full(len(val_y), trn_y.mean())

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(val_y, preds)

In [None]:
avg_acceptance_rate_by_body_type = df_members.groupby('body_type')['acceptance_rate'].mean()
avg_acceptance_rate_by_body_type

In [None]:
# val_xs.body_type holds integer category codes, so group means are looked up
# by code with .map instead of indexing a label-indexed Series with integers.
# The means come from training rows only; a code that never occurs in training
# falls back to the overall training mean.
preds = val_xs.body_type.map(trn_df.groupby('body_type')[dep].mean()).fillna(trn_y.mean())

In [None]:
mean_absolute_error(val_y, preds)

In [None]:
avg_acceptance_rate_by_religious_orientation = df_members.groupby('religious_orientation', observed = False)['acceptance_rate'].mean()
avg_acceptance_rate_by_religious_orientation

In [None]:
# val_xs.religious_orientation holds integer category codes, so group means are
# looked up by code with .map instead of indexing a label-indexed Series with
# integers. The means come from training rows only; a code that never occurs
# in training falls back to the overall training mean.
preds = val_xs.religious_orientation.map(trn_df.groupby('religious_orientation')[dep].mean()).fillna(trn_y.mean())

In [None]:
mean_absolute_error(val_y, preds)

In [None]:
df_members['height_category'].head()

In [None]:
avg_acceptance_rate_by_height_category = df_members.groupby('height_category', observed = False)['acceptance_rate'].mean()
avg_acceptance_rate_by_height_category

In [None]:
# val_xs.height_category holds integer category codes, so group means are
# looked up by code with .map instead of indexing a label-indexed Series with
# integers. The means come from training rows only; a code that never occurs
# in training falls back to the overall training mean.
preds = val_xs.height_category.map(trn_df.groupby('height_category')[dep].mean()).fillna(trn_y.mean())

In [None]:
mean_absolute_error(val_y, preds)

In [None]:
def _side_score(side, y):
    tot = side.sum()
    if tot<=1: return 0
    return y[side].std()*tot

In [None]:
def score(col, y, split):
    """Score a binary split of ``col`` at ``split``.

    Rows with ``col <= split`` form one side and the remaining rows the other.
    The two ``_side_score`` values are added and divided by ``len(y)``.
    """
    goes_left = col <= split
    left_part = _side_score(goes_left, y)
    right_part = _side_score(~goes_left, y)
    return (left_part + right_part) / len(y)

In [None]:
score(trn_xs["gender"], trn_y, 0.5)

In [None]:
def min_col(df, nm):
    """Find the split value of column ``nm`` with the lowest ``score``.

    Every distinct non-missing value of the column is tried as a threshold
    against the target column named by the global ``dep``.

    Returns:
        ``(best_split_value, best_score)``.

    Raises:
        ValueError: If the column has no non-missing values to split on.
    """
    col,y = df[nm],df[dep]
    # dropna() has already removed missing values, so every candidate is scored
    # and position i in `scores` always refers to `unq[i]`.
    unq = col.dropna().unique()
    if len(unq) == 0:
        raise ValueError(f"Column {nm!r} has no non-missing values to split on")
    scores = np.array([score(col, y, o) for o in unq])
    idx = scores.argmin()
    return unq[idx],scores[idx]

In [None]:
cols = cats+conts
{o:min_col(trn_df, o) for o in cols}

In [None]:
trn_df.head()

In [None]:
df_members.head()

In [None]:
import numpy as np
df_members.replace(r'^\s*$', np.nan, regex=True, inplace=True) # Replaces empty strings and strings with only whitespace
# Now df.isna().sum() should reflect these changes

In [None]:
df_members.isna().sum()

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz

m = DecisionTreeRegressor(max_leaf_nodes=4).fit(trn_xs, trn_y);

In [None]:
import graphviz

def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    """Render fitted tree ``t`` as a ``graphviz.Source``.

    The DOT text comes from ``export_graphviz`` with the columns of ``df`` as
    feature names; extra keyword arguments are forwarded to it. ``size`` and
    ``ratio`` are injected by rewriting the ``Tree {`` header of the DOT text.
    """
    dot_text = export_graphviz(
        t,
        out_file=None,
        feature_names=df.columns,
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=False,
        precision=precision,
        **kwargs,
    )
    header_with_layout = f'Tree {{ size={size}; ratio={ratio}'
    sized_dot = re.sub('Tree {', header_with_layout, dot_text)
    return graphviz.Source(sized_dot)

In [None]:
draw_tree(m, trn_xs, size=10)

In [None]:
def gini(cond):
    """Return ``1 - mean(v)**2 - mean(1 - v)**2`` for the ``dep`` values of the
    ``df_members`` rows selected by ``cond``.

    NOTE(review): this is the two-class Gini form, which is an impurity only
    for a 0/1 target; ``dep`` is fitted as a regression target elsewhere in
    this notebook, and no visible cell calls this function.
    """
    selected = df_members.loc[cond, dep]
    mean_value = selected.mean()
    mean_complement = (1 - selected).mean()
    return 1 - mean_value**2 - mean_complement**2

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(val_y, m.predict(val_xs))

In [None]:
m = DecisionTreeRegressor(min_samples_leaf=300)
m.fit(trn_xs, trn_y)
# draw_tree(m, trn_xs, size=25)

In [None]:
mean_absolute_error(val_y, m.predict(val_xs))

In [None]:
def get_tree(prop=0.75):
    """Fit one decision tree on a random sample of the training rows.

    ``int(len(trn_y) * prop)`` row positions are drawn with ``random.choice``
    and a ``DecisionTreeRegressor(min_samples_leaf=5)`` is fitted on those rows
    of the global ``trn_xs`` / ``trn_y``. Returns the fitted tree.
    """
    total_rows = len(trn_y)
    sample_size = int(total_rows*prop)
    picked = random.choice(total_rows, sample_size)
    tree = DecisionTreeRegressor(min_samples_leaf=5)
    return tree.fit(trn_xs.iloc[picked], trn_y.iloc[picked])

In [None]:
trees = [get_tree() for t in range(100)]

In [None]:
all_probs = [t.predict(val_xs) for t in trees]
avg_probs = np.stack(all_probs).mean(0)

mean_absolute_error(val_y, avg_probs)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(100, min_samples_leaf=5)
rf.fit(trn_xs, trn_y);
mean_absolute_error(val_y, rf.predict(val_xs))

In [None]:
# Importances are read from `rf`, the random forest fitted in the cell above,
# not from `m`, which is the single decision tree fitted earlier.
df_importances = pd.DataFrame(dict(cols=trn_xs.columns, imp=rf.feature_importances_))
# Features with zero importance are left out of the chart.
df_filtered = df_importances[df_importances['imp'] > 0]

# Scale the figure height with the number of bars so labels stay readable.
num_features = len(df_filtered)
desired_height = num_features * 0.4

fig, ax = plt.subplots(figsize=(10, desired_height))

df_filtered.plot(x='cols', y='imp', kind='barh', ax=ax)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, min_samples_leaf=5, random_state=42)
gbr.fit(trn_xs, trn_y)
mean_absolute_error(val_y, gbr.predict(val_xs))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

In [None]:
r2_score(val_y, gbr.predict(val_xs))

In [None]:
r2_score(val_y, rf.predict(val_xs))

In [None]:
from sklearn.linear_model import SGDRegressor
# A basic linear regression setup with SGD
sgd_regressor = SGDRegressor(loss='squared_error',
                             penalty='l2',
                             alpha=0.0001,
                             max_iter=1000,
                             tol=1e-3,
                             random_state=42) # for reproducibility

In [None]:
sgd_regressor.fit(trn_xs, trn_y)
y_pred = sgd_regressor.predict(val_xs)

In [None]:
mean_absolute_error(val_y, y_pred)

In [None]:
r2_score(val_y, y_pred)

In [None]:
from sklearn.preprocessing import StandardScaler # Import StandardScaler
scaler = StandardScaler()
trn_xs_scaled = scaler.fit_transform(trn_xs)
val_xs_scaled = scaler.transform(val_xs)

sgd_regressor = SGDRegressor(loss='squared_error',
                             penalty='l2',
                             alpha=0.0001,
                             max_iter=1000,
                             tol=1e-3,
                             random_state=42)

sgd_regressor.fit(trn_xs_scaled, trn_y)

y_pred = sgd_regressor.predict(val_xs_scaled)

In [None]:
mean_absolute_error(val_y, y_pred)

In [None]:
r2_score(val_y, y_pred)

In [None]:
from sklearn.linear_model import Ridge
scaler = StandardScaler()
trn_xs_scaled = scaler.fit_transform(trn_xs)
val_xs_scaled = scaler.transform(val_xs)

ridge_model = Ridge(alpha=1.0, random_state=42)

ridge_model.fit(trn_xs_scaled, trn_y)

y_pred = ridge_model.predict(val_xs_scaled)


In [None]:
mean_absolute_error(val_y, y_pred)

In [None]:
r2_score(val_y, y_pred)

In [None]:
from sklearn.linear_model import RidgeCV
alphas_to_test = np.logspace(-3, 3, 100)

ridge_cv_model = RidgeCV(alphas=alphas_to_test, cv=5, scoring='neg_mean_absolute_error')

ridge_cv_model.fit(trn_xs_scaled, trn_y)

best_alpha = ridge_cv_model.alpha_

y_pred_cv = ridge_cv_model.predict(val_xs_scaled)

In [None]:
mean_absolute_error(val_y, y_pred_cv)

In [None]:
r2_score(val_y, y_pred_cv)

In [None]:
from xgboost import XGBRegressor
scaler = StandardScaler()
trn_xs_scaled = scaler.fit_transform(trn_xs)
val_xs_scaled = scaler.transform(val_xs)

xgb_model = XGBRegressor(objective='reg:squarederror',
                         n_estimators=150,
                         learning_rate=0.1,

                         random_state=42)

xgb_model.fit(trn_xs_scaled, trn_y)

y_pred = xgb_model.predict(val_xs_scaled)

In [None]:
mean_absolute_error(val_y, y_pred)

In [None]:
df_importances = pd.DataFrame(dict(cols=trn_xs.columns, imp=xgb_model.feature_importances_))
df_filtered = df_importances[df_importances['imp'] > 0]

num_features = len(df_filtered)
desired_height = num_features * 0.4

fig, ax = plt.subplots(figsize=(10, desired_height))

df_filtered.plot(x='cols', y='imp', kind='barh', ax=ax)

plt.tight_layout()
plt.show()

In [None]:
features_to_keep = df_importances[df_importances['imp'] > 0]['cols'].tolist()
features_to_keep

In [None]:
# Build the reduced frame: the selected feature columns plus the target.
df_members_filtered_features = df_members[features_to_keep].copy()
proc_cats(df_members_filtered_features)
df_members_filtered_features['acceptance_rate'] = df_members['acceptance_rate']
# number_live_with_you is only present when it survived feature selection;
# without this guard the cell raises KeyError whenever it was filtered out.
if 'number_live_with_you' in df_members_filtered_features.columns:
    df_members_filtered_features['number_live_with_you'] = df_members_filtered_features['number_live_with_you'].replace('N/A', 0)
df_members_filtered_features.replace(r'^\s*$', np.nan, regex=True, inplace=True)
df_members_filtered_features.columns

In [None]:
cats = ['id', 'country', 'city', 'state', 'gender', 'religious_orientation', 'ethnicity', 'baal_teshuva', 'cohen', 'female_convert', 'parents_convert', 'mother_maternal_grandmother_jewish', 'family_religious_background', 'describe_family_religious_background', 'female_hc', 'kosher', 'female_dress', 'male_hc', 'frequency_of_tefilah', 'male_shul_attendance', 'torah_study', 'watching_tv', 'going_out_to_movies', 'watching_movies_at_home', 'secular_education', 'emphasis_of_studies', 'jewish_education', 'study_in_israel', 'profession', 'job_description', 'eye_color', 'hair_color', 'body_type', 'mental_physical_disability', 'my_marriage_status', 'want_additional_children', 'can_marry_cohen', 'political_orientation', 'smoking_habits', 'how_active_are_you', 'plan_to_aliya', 'willing_to_relocate', 'pet_person', 'pet_i_own', 'additional_pet_i_own', 'native_language', 'languages_spoken', 'desired_marital_status', 'minimum_education_level', 'acceptable_for_match_to_have_children', 'acceptable_religious_orientation', 'acceptable_smoking_habits', 'ok_dating_someone_with_disability', 'acceptable_aliyah_responses', 'acceptable_kosher_observance', 'ok_dating_baal_teshuva', 'family_relgious_background', 'desired_torah_study', 'desired_female_hc', 'desired_female_dress', 'jewish_education_preference', 'body_type_preference', 'preference_regarding_ethnicity', 'preference_cultural_background', 'my_personality_traits', 'my_personality_go_out_to', 'favorite_music', 'physical_activities_interests', 'my_favorite_pastimes', 'looking_for_in_a_person', 'short_description_of_yourself', 'community_work', 'introvert_extravert', 'sensor_intuitive', 'thinker_feeler', 'judger_perceiver', 'approved', 'dating_status', 'colleges_universities', 'parents_convert_before_birth', 'elementary_school', 'location_i_grew_up', 'name_secondary_school', 'name_study_one_year', 'parent_location', 'complete_incomplete', 'photo', 'site', 'profile_last_modified_date', 'updated', 'acceptable_places_to_live', 
'height_category', 'how_long_single']

conts = ['age', 'years_orthodox_baal_teshuva', 'times_divorced', 'how_many_children', 'number_live_with_you', 'age_of_youngest', 'number_of_siblings', 'age_min', 'age_max', 'min_height_inches', 'max_height_inches', 'height_inches', 'num_matches']

In [None]:
# Fixed random_state: without it every run produces a different split, so the
# MAE values below could not be reproduced or compared between runs.
trn_df,val_df = train_test_split(df_members_filtered_features, test_size=0.25, random_state=42)
# Own copies, so the column assignments below do not write into a slice.
trn_df,val_df = trn_df.copy(),val_df.copy()

# Only the categorical / continuous columns that survived feature selection.
existing_cats_in_trn_df = [col for col in cats if col in trn_df.columns]
existing_conts_in_trn_df = [col for col in conts if col in trn_df.columns]

trn_df[existing_cats_in_trn_df] = trn_df[existing_cats_in_trn_df].apply(lambda x: x.cat.codes)
val_df[existing_cats_in_trn_df] = val_df[existing_cats_in_trn_df].apply(lambda x: x.cat.codes)

trn_df = remove_objects(trn_df)
val_df = remove_objects(val_df)

trn_xs,trn_y = xs_y(trn_df, existing_cats_in_trn_df, existing_conts_in_trn_df)
val_xs,val_y = xs_y(val_df, existing_cats_in_trn_df, existing_conts_in_trn_df)


In [None]:
scaler = StandardScaler()
trn_xs_scaled = scaler.fit_transform(trn_xs)
val_xs_scaled = scaler.transform(val_xs)

xgb_model = XGBRegressor(objective='reg:squarederror',
                        n_estimators=100,
                        learning_rate=0.1,
                        random_state=42)

xgb_model.fit(trn_xs_scaled, trn_y)

y_pred = xgb_model.predict(val_xs_scaled)

In [None]:
mean_absolute_error(val_y, y_pred)

In [None]:
trn_xs.isnull().sum()

In [None]:
# Apply the same cleaning to both feature frames: blank strings become NaN and
# every missing value is then filled with 0. Reassigning instead of mutating
# in place keeps the cell safe to re-run.
trn_xs = trn_xs.replace(r'^\s*$', np.nan, regex=True).fillna(0)
val_xs = val_xs.replace(r'^\s*$', np.nan, regex=True).fillna(0)
# Remaining missing values per column (training, validation).
trn_xs.isnull().sum(), val_xs.isnull().sum()

In [None]:
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, min_samples_leaf=5, random_state=42)
gbr.fit(trn_xs, trn_y)
mean_absolute_error(val_y, gbr.predict(val_xs))

In [None]:
from sklearn.linear_model import Lasso, LassoCV

In [None]:
def feature_engineering_prediction(rate, random_state=42):
    """Refit several regressors on the features whose importance exceeds ``rate``
    and print each model's validation MAE.

    Features are taken from the global ``df_importances`` (columns ``cols`` and
    ``imp``) and the rows from the global ``df_members``. Gradient boosting,
    XGBoost, Lasso and LassoCV are fitted in that order.

    Args:
        rate: Importance threshold; only features with ``imp > rate`` are kept.
        random_state: Seed for the train/validation split, so repeated calls
            with different ``rate`` values are evaluated on the same rows.

    Returns:
        None. Results are printed.
    """
    features_to_keep = df_importances[df_importances['imp'] > rate]['cols'].tolist()
    df_members_filtered_features = df_members[features_to_keep].copy()
    proc_cats(df_members_filtered_features)
    df_members_filtered_features['acceptance_rate'] = df_members['acceptance_rate']
    if 'number_live_with_you' in df_members_filtered_features.columns:
        df_members_filtered_features['number_live_with_you'] = df_members_filtered_features['number_live_with_you'].replace('N/A', 0)
    df_members_filtered_features.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    trn_df,val_df = train_test_split(df_members_filtered_features, test_size=0.25, random_state=random_state)
    # Own copies, so the column assignments below do not write into a slice.
    trn_df,val_df = trn_df.copy(),val_df.copy()

    existing_cats_in_trn_df = [col for col in cats if col in trn_df.columns]
    existing_conts_in_trn_df = [col for col in conts if col in trn_df.columns]

    trn_df[existing_cats_in_trn_df] = trn_df[existing_cats_in_trn_df].apply(lambda x: x.cat.codes)
    val_df[existing_cats_in_trn_df] = val_df[existing_cats_in_trn_df].apply(lambda x: x.cat.codes)

    trn_df = remove_objects(trn_df)
    val_df = remove_objects(val_df)

    trn_xs,trn_y = xs_y(trn_df, existing_cats_in_trn_df, existing_conts_in_trn_df)
    val_xs,val_y = xs_y(val_df, existing_cats_in_trn_df, existing_conts_in_trn_df)

    trn_xs.fillna(0, inplace=True)
    val_xs.fillna(0, inplace=True)

    gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, min_samples_leaf=5, random_state=42)
    gbr.fit(trn_xs, trn_y)
    # Predict once and reuse the result for the report.
    gbr_preds = gbr.predict(val_xs)
    print(f'Gradient Boost MAE: {mean_absolute_error(val_y, gbr_preds)}')

    scaler = StandardScaler()
    trn_xs_scaled = scaler.fit_transform(trn_xs)
    val_xs_scaled = scaler.transform(val_xs)

    xgb_model = XGBRegressor(objective='reg:squarederror',
                             n_estimators=150,
                             learning_rate=0.1,
                             random_state=42)

    xgb_model.fit(trn_xs_scaled, trn_y)

    y_pred = xgb_model.predict(val_xs_scaled)
    print(f'XGBoost MAE: {mean_absolute_error(val_y, y_pred)}')

    # NOTE(review): both Lasso models are fitted on the unscaled features even
    # though scaled copies exist; L1 penalties are scale-sensitive — confirm
    # this is intended.
    lasso = Lasso()
    lasso.fit(trn_xs, trn_y)
    print(f'Lasso MAE: {mean_absolute_error(val_y, lasso.predict(val_xs))}')

    # NOTE(review): cv=100 means 100 folds; it needs at least 100 training rows
    # and refits the path 100 times — confirm this is not meant to be smaller.
    lcv = LassoCV(cv=100, random_state=42)
    lcv.fit(trn_xs, trn_y)
    print(f'LassoCV MAE: {mean_absolute_error(val_y, lcv.predict(val_xs))}')

In [None]:
feature_engineering_prediction(-1)