In [2]:
# to handle datasets
import pandas as pd
import numpy as np

# to divide train and test set
from sklearn.model_selection import train_test_split

# to visualise all the columns in the dataframe
# (None removes the column cap, so wide frames are rendered without truncation)
pd.set_option('display.max_columns', None)

In [3]:
# read the source CSV into the raw `data` frame
data = pd.read_csv(filepath_or_buffer='/home/jupyter/dyson-test/old-data/dataset/cali_ces.csv')

# report (rows, columns) followed by the column index, one per line
print(data.shape, data.columns, sep='\n')

# preview the first rows of the dataset
data.head()

(20640, 10)
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_ces', 'ocean_proximity'],
      dtype='object')


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_ces,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [5]:
# Perform feature engineering on a copy so the raw `data` frame is left untouched
df = data.copy()

# Ratio features. Every operand is taken from `df` (the original mixed `df` and
# `data` for income_per_person, which only worked because both share one index).
# NOTE(review): a zero `population` or `total_rooms` would produce inf/NaN here;
# confirm the dataset contains none.
# Create a new feature "total_rooms_per_person"
df['total_rooms_per_person'] = df['total_rooms'] / df['population']
# Create a new feature "bedrooms_per_room"
# (inherits any missing values present in `total_bedrooms`)
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
# Create a new feature "income_per_person"
df['income_per_person'] = df['median_income'] / df['population']

# Encode the categorical feature "ocean_proximity" as 0/1 integer indicator columns
one_hot_encoded = pd.get_dummies(df['ocean_proximity'], prefix='ocean_proximity').astype(int)

# Append the indicator columns and remove the original categorical column,
# building a new frame instead of mutating in place
df = pd.concat([df, one_hot_encoded], axis=1).drop(columns=['ocean_proximity'])

# Split the DataFrame into features (X) and target variable (y)
X = df.drop(columns=['median_ces'])
y = df['median_ces']



# Split the features and target variable into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Work on independent copies so the column assignments below modify these
# frames themselves and cannot raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# identifying the candidate variables (every feature column; the filter guards
# against the target column being present)
num_vars = [var for var in X_train.columns if var != 'median_ces']

# making a list with the variables that contain missing values in the train set
vars_with_na = [var for var in num_vars if X_train[var].isnull().any()]

# replacing missing values
for var in vars_with_na:

    # calculate the mean using the train set only, so that no information
    # from the test set leaks into the imputation value
    mean_val = X_train[var].mean()

    print(var, mean_val)

    # add binary missing indicator (in train and test); this must happen
    # before the fill, otherwise the indicator would be all zeros
    X_train[var + '_na'] = np.where(X_train[var].isnull(), 1, 0)
    X_test[var + '_na'] = np.where(X_test[var].isnull(), 1, 0)

    # replace missing values by the train mean (in train and test).
    # Assign the result back: `X[var].fillna(..., inplace=True)` operates on a
    # temporary Series and does not reliably update the frame (it warns in
    # pandas 2.x and is a no-op under Copy-on-Write).
    X_train[var] = X_train[var].fillna(mean_val)
    X_test[var] = X_test[var].fillna(mean_val)

# check that we have no more missing values in the imputed variables.
# Assertions are used because a bare expression in the middle of a cell is
# evaluated but never displayed.
assert not X_train[vars_with_na].isnull().any().any(), 'missing values remain in X_train'
assert not X_test[vars_with_na].isnull().any().any(), 'missing values remain in X_test'

# Attach the target column to the training features (column-wise, aligned on index)
train_df = pd.concat(objs=[X_train, y_train], axis='columns')

# Persist the training frame without the index column
train_df.to_csv(path_or_buf='train.csv', index=False)

# Persist the test features without the index column
# NOTE(review): only X_test is written, so test.csv carries no target values;
# confirm that is intended.
X_test.to_csv(path_or_buf='test.csv', index=False)




# data

# # Split the DataFrame into training and testing sets
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

# # Save the training and testing DataFrames to CSV files
# train_df.to_csv('train.csv', index=False)
# test_df.to_csv('test.csv', index=False)

total_bedrooms 537.6391096979332
bedrooms_per_room 0.21279519741968828


In [1]:
# Shell escape: print the kernel's current working directory.
# The relative 'train.csv' / 'test.csv' paths used when saving resolve against it.
!pwd

/home/jupyter/dyson-test/old-data/dataset


In [13]:
# Display the combined training frame (features plus the `median_ces` target).
# NOTE(review): the saved output below still shows a NaN in `total_bedrooms`
# (row 9845) and no `*_na` indicator columns, so it does not appear to reflect
# the imputation step. Re-run the notebook top to bottom and confirm.
train_df
# train_df.shape

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,total_rooms_per_person,bedrooms_per_room,income_per_person,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,median_ces
12069,-117.55,33.83,6,502,76.0,228,65,4.2386,2.201754,0.151394,0.018590,0,1,0,0,0,500001
15925,-122.44,37.73,52,2381,492.0,1485,447,4.3898,1.603367,0.206636,0.002956,0,0,0,1,0,270000
11162,-118.00,33.83,26,1718,385.0,1022,368,3.9333,1.681018,0.224098,0.003849,1,0,0,0,0,196100
4904,-118.26,34.01,38,697,208.0,749,206,1.4653,0.930574,0.298422,0.001956,1,0,0,0,0,118800
4683,-118.36,34.08,52,2373,601.0,1135,576,3.1765,2.090749,0.253266,0.002799,1,0,0,0,0,225000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,-121.26,38.27,20,1314,229.0,712,219,4.4125,1.845506,0.174277,0.006197,0,1,0,0,0,144600
19648,-120.89,37.48,27,1118,195.0,647,209,2.9135,1.727975,0.174419,0.004503,0,1,0,0,0,159400
9845,-121.90,36.58,31,1431,,704,393,3.1977,2.032670,,0.004542,0,0,0,0,1,289300
10799,-117.93,33.62,34,2125,498.0,1052,468,5.6315,2.019962,0.234353,0.005353,1,0,0,0,0,484600


In [14]:
# Display the test feature frame (no target column).
X_test
# X_test.shape


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,total_rooms_per_person,bedrooms_per_room,income_per_person,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14740,-117.05,32.58,22,2101,399.0,1551,371,4.1518,1.354610,0.189910,0.002677,0,0,0,0,1
10101,-117.97,33.92,32,2620,398.0,1296,429,5.7796,2.021605,0.151908,0.004460,1,0,0,0,0
20566,-121.84,38.65,29,3167,548.0,1554,534,4.3487,2.037967,0.173034,0.002798,0,1,0,0,0
2670,-115.60,33.20,37,709,187.0,390,142,2.4511,1.817949,0.263752,0.006285,0,1,0,0,0
15709,-122.43,37.79,25,1637,394.0,649,379,5.0049,2.522342,0.240684,0.007712,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6655,-118.13,34.16,33,2682,716.0,2050,692,2.4817,1.308293,0.266965,0.001211,1,0,0,0,0
3505,-118.45,34.25,36,1453,270.0,808,275,4.3839,1.798267,0.185822,0.005426,1,0,0,0,0
1919,-120.92,38.86,11,1720,345.0,850,326,3.2027,2.023529,0.200581,0.003768,0,1,0,0,0
1450,-121.95,37.96,18,2739,393.0,1072,374,6.1436,2.555037,0.143483,0.005731,0,1,0,0,0


In [15]:
# Display the training target Series (`median_ces`), indexed by original row label.
y_train
# y_train.shape


12069    500001
15925    270000
11162    196100
4904     118800
4683     225000
          ...  
13123    144600
19648    159400
9845     289300
10799    484600
2732      69400
Name: median_ces, Length: 16512, dtype: int64

In [16]:
# Display the test target Series (`median_ces`), indexed by original row label.
y_test
# y_test.shape

14740    136900
10101    241300
20566    200700
2670      72500
15709    460000
          ...  
6655     169500
3505     204600
1919     128600
1450     259500
4148     167600
Name: median_ces, Length: 4128, dtype: int64