In [30]:
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SequentialFeatureSelector

In [31]:
df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

df1 = df1.drop(columns=['sub_area'])
df2_dropped = df2.drop(columns=['sub_area','row ID'])

In [32]:
categorical_columns = df1.select_dtypes(include=['object']).columns

#Label encode categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
  df1[col] = label_encoder.fit_transform(df1[col])
  df2_dropped[col] = label_encoder.transform(df2_dropped[col])


#df1_encoded = df1_encoded.drop(columns=['area_m', 'raion_popul', 'green_zone_part', 'indust_part', 'children_preschool', 'preschool_education_centers_raion', 'children_school', 'school_education_centers_raion', 'healthcare_centers_raion', 'sport_objects_raion', 'additional_education_raion', 'shopping_centers_raion', 'full_all', 'male_f', 'female_f', 'young_all', 'young_male', 'young_female', 'work_all', 'work_male', 'work_female', 'ekder_all', 'ekder_male', 'ekder_female', '0_6_all', '0_6_male', '0_6_female', '7_14_all', '7_14_male', '7_14_female', '0_17_all', '0_17_male', '0_17_female', '16_29_all', '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female', 'raion_build_count_with_material_info', 'build_count_block', 'build_count_wood', 'build_count_frame', 'build_count_brick', 'build_count_panel', 'build_count_mix', 'raion_build_count_with_builddate_info', 'build_count_1921-1945', 'build_count_1946-1970', 'build_count_1971-1995', 'ID_metro', 'metro_min_avto', 'metro_km_avto', 'metro_min_walk', 'metro_km_walk', 'park_km', 'green_zone_km', 'water_treatment_km', 'cemetery_km', 'incineration_km', 'railroad_station_walk_km', 'railroad_station_walk_min', 'ID_railroad_station_walk', 'railroad_station_avto_km', 'railroad_station_avto_min', 'ID_railroad_station_avto', 'water_km', 'mkad_km', 'ttk_km', 'sadovoe_km', 'bulvar_ring_km', 'kremlin_km', 'big_road1_km', 'ID_big_road1', 'big_road2_km', 'ID_big_road2', 'railroad_km', 'zd_vokzaly_avto_km', 'ID_railroad_terminal', 'bus_terminal_avto_km', 'ID_bus_terminal', 'oil_chemistry_km', 'nuclear_reactor_km', 'radiation_km', 'power_transmission_line_km', 'thermal_power_plant_km', 'ts_km', 'big_market_km', 'market_shop_km', 'swim_pool_km', 'ice_rink_km', 'stadium_km', 'basketball_km', 'hospice_morgue_km', 'detention_facility_km', 'university_km', 'workplaces_km', 'shopping_centers_km', 'office_km', 'mosque_km', 'theater_km', 'museum_km', 'exhibition_km', 'catering_km', 'green_part_500', 'green_part_1000', 'prom_part_1000', 'green_part_1500', 'prom_part_1500', 'sport_count_1500', 'market_count_1500', 'green_part_2000', 'prom_part_2000', 'trc_count_2000', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000', 'mosque_count_2000', 'sport_count_2000', 'market_count_2000', 'green_part_3000', 'prom_part_3000', 'office_sqm_3000', 'trc_count_3000', 'trc_sqm_3000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg', 'cafe_avg_price_3000', 'mosque_count_3000', 'sport_count_3000', 'market_count_3000', 'green_part_5000', 'prom_part_5000', 'office_count_5000', 'office_sqm_5000', 'trc_count_5000', 'trc_sqm_5000', 'cafe_count_5000', 'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000', 'cafe_count_5000_na_price', 'cafe_count_5000_price_500', 'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000', 'cafe_count_5000_price_high', 'big_church_count_5000', 'church_count_5000', 'mosque_count_5000', 'leisure_count_5000', 'sport_count_5000', 'market_count_5000', 'product_type_OwnerOccupier', 'culture_objects_top_25_yes', 'thermal_power_plant_raion_yes', 'incineration_raion_yes', 'oil_chemistry_raion_yes', 'radiation_raion_yes', 'railroad_terminal_raion_yes', 'big_market_raion_yes', 'nuclear_reactor_raion_yes', 'detention_facility_raion_yes', 'water_1line_yes', 'big_road1_1line_yes', 'railroad_1line_yes', 'ecology_good', 'ecology_no data', 'ecology_poor', 'ecology_satisfactory'],axis=1)
#df2_encoded = df2_encoded.drop(columns=['area_m', 'raion_popul', 'green_zone_part', 'indust_part', 'children_preschool', 'preschool_education_centers_raion', 'children_school', 'school_education_centers_raion', 'healthcare_centers_raion', 'sport_objects_raion', 'additional_education_raion', 'shopping_centers_raion', 'full_all', 'male_f', 'female_f', 'young_all', 'young_male', 'young_female', 'work_all', 'work_male', 'work_female', 'ekder_all', 'ekder_male', 'ekder_female', '0_6_all', '0_6_male', '0_6_female', '7_14_all', '7_14_male', '7_14_female', '0_17_all', '0_17_male', '0_17_female', '16_29_all', '16_29_male', '16_29_female', '0_13_all', '0_13_male', '0_13_female', 'raion_build_count_with_material_info', 'build_count_block', 'build_count_wood', 'build_count_frame', 'build_count_brick', 'build_count_panel', 'build_count_mix', 'raion_build_count_with_builddate_info', 'build_count_1921-1945', 'build_count_1946-1970', 'build_count_1971-1995', 'ID_metro', 'metro_min_avto', 'metro_km_avto', 'metro_min_walk', 'metro_km_walk', 'park_km', 'green_zone_km', 'water_treatment_km', 'cemetery_km', 'incineration_km', 'railroad_station_walk_km', 'railroad_station_walk_min', 'ID_railroad_station_walk', 'railroad_station_avto_km', 'railroad_station_avto_min', 'ID_railroad_station_avto', 'water_km', 'mkad_km', 'ttk_km', 'sadovoe_km', 'bulvar_ring_km', 'kremlin_km', 'big_road1_km', 'ID_big_road1', 'big_road2_km', 'ID_big_road2', 'railroad_km', 'zd_vokzaly_avto_km', 'ID_railroad_terminal', 'bus_terminal_avto_km', 'ID_bus_terminal', 'oil_chemistry_km', 'nuclear_reactor_km', 'radiation_km', 'power_transmission_line_km', 'thermal_power_plant_km', 'ts_km', 'big_market_km', 'market_shop_km', 'swim_pool_km', 'ice_rink_km', 'stadium_km', 'basketball_km', 'hospice_morgue_km', 'detention_facility_km', 'university_km', 'workplaces_km', 'shopping_centers_km', 'office_km', 'mosque_km', 'theater_km', 'museum_km', 'exhibition_km', 'catering_km', 'green_part_500', 'green_part_1000', 'prom_part_1000', 'green_part_1500', 'prom_part_1500', 'sport_count_1500', 'market_count_1500', 'green_part_2000', 'prom_part_2000', 'trc_count_2000', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 'cafe_avg_price_2000', 'mosque_count_2000', 'sport_count_2000', 'market_count_2000', 'green_part_3000', 'prom_part_3000', 'office_sqm_3000', 'trc_count_3000', 'trc_sqm_3000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg', 'cafe_avg_price_3000', 'mosque_count_3000', 'sport_count_3000', 'market_count_3000', 'green_part_5000', 'prom_part_5000', 'office_count_5000', 'office_sqm_5000', 'trc_count_5000', 'trc_sqm_5000', 'cafe_count_5000', 'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg', 'cafe_avg_price_5000', 'cafe_count_5000_na_price', 'cafe_count_5000_price_500', 'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000', 'cafe_count_5000_price_high', 'big_church_count_5000', 'church_count_5000', 'mosque_count_5000', 'leisure_count_5000', 'sport_count_5000', 'market_count_5000', 'product_type_OwnerOccupier', 'culture_objects_top_25_yes', 'thermal_power_plant_raion_yes', 'incineration_raion_yes', 'oil_chemistry_raion_yes', 'radiation_raion_yes', 'railroad_terminal_raion_yes', 'big_market_raion_yes', 'nuclear_reactor_raion_yes', 'detention_facility_raion_yes', 'water_1line_yes', 'big_road1_1line_yes', 'railroad_1line_yes', 'ecology_good', 'ecology_no data', 'ecology_poor', 'ecology_satisfactory'],axis=1)
print(df2.shape)

(77789, 272)


In [42]:
X_scaled = X_scaled[['full_sq', 'floor', 'build_count_monolith', 'industrial_km', 'trc_sqm_500',
 'mosque_count_500', 'leisure_count_500', 'office_sqm_1000',
 'cafe_count_1000_price_high', 'leisure_count_1000', 'power_transmission_line_km', 'big_market_km', 'public_healthcare_km', 'workplaces_km']]
df2_encoded_scaled = df2_encoded_scaled[['full_sq', 'floor', 'build_count_monolith', 'industrial_km', 'trc_sqm_500',
 'mosque_count_500', 'leisure_count_500', 'office_sqm_1000',
 'cafe_count_1000_price_high', 'leisure_count_1000', 'power_transmission_line_km', 'big_market_km', 'public_healthcare_km', 'workplaces_km']]

used.append("Forward Feature Selection (n=14)")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
print(df2.head())

In [33]:
X = df1.drop(columns=['price_doc',],axis=1)
y = df1['price_doc']


In [34]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
df2_encoded = imputer.fit_transform(df2_dropped)

In [35]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
df2_encoded_scaled = scaler.fit_transform(df2_encoded)

In [None]:

variance_threshold = 0.02 # Set your desired threshold
selector = VarianceThreshold(threshold=variance_threshold)
X_train_high_variance = selector.fit_transform(X_scaled)
X_test_high_variance = selector.transform(df2_encoded_scaled)

Selected Features:
['x1', 'x2', 'x3', 'x4', 'x9', 'x10', 'x13', 'x14', 'x15', 'x18', 'x23', 'x24', 'x27', 'x28', 'x31', 'x35', 'x44', 'x62', 'x67', 'x71', 'x72', 'x78', 'x81', 'x82', 'x84', 'x93', 'x94', 'x95', 'x96', 'x98', 'x103', 'x108', 'x109', 'x113', 'x121', 'x124', 'x132', 'x135', 'x140', 'x141', 'x142', 'x143', 'x147', 'x151', 'x152', 'x154', 'x155', 'x158', 'x159', 'x161', 'x162', 'x163', 'x164', 'x165', 'x166', 'x167', 'x172', 'x175', 'x176', 'x178', 'x179', 'x181', 'x182', 'x186', 'x197', 'x198', 'x199', 'x207', 'x210', 'x215', 'x217', 'x218', 'x221', 'x223', 'x249', 'x253', 'x267']
77


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
model = RandomForestRegressor(n_estimators=200, max_depth=14, min_samples_leaf=25, min_samples_split=15,max_features=15,n_jobs=-1)
#model = RandomForestRegressor()

model.fit(X_train_high_variance,y)
predictions = model.predict(X_test_high_variance)



print(predictions)

In [None]:
submission_df = pd.DataFrame({'row ID': df2['row ID'], 'price_doc': predictions.flatten()})

# Save the results to a CSV file
submission_df.to_csv('predictions79.csv', index=False)