In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


# Exploration

In [6]:
airbnb = pd.read_csv("airbnb_train.csv")
airbnb.head()

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,zipcode,bedrooms,beds
0,5708593,4.317488,House,Private room,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",3,1.0,Real Bed,flexible,False,...,,33.782712,-118.13441,Island style Spa Studio,Long Beach,0,,90804,0.0,2.0
1,14483613,4.007333,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,2.0,Real Bed,strict,False,...,2017-09-17,40.705468,-73.909439,"Beautiful and Simple Room W/2 Beds, 25 Mins to...",Ridgewood,38,86.0,11385,1.0,2.0
2,10412649,7.090077,Apartment,Entire home/apt,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",6,2.0,Real Bed,flexible,False,...,,38.917537,-77.031651,2br/2ba luxury condo perfect for infant / toddler,U Street Corridor,0,,20009,2.0,2.0
3,17954362,3.555348,House,Private room,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1,1.0,Real Bed,flexible,True,...,2017-09-29,40.736001,-73.924248,Manhattan view from Queens. Lovely single room .,Sunnyside,19,96.0,11104,1.0,1.0
4,9969781,5.480639,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,moderate,True,...,2017-08-28,37.744896,-122.430665,Zen Captured Noe Valley House,Noe Valley,15,96.0,94131,2.0,2.0


In [7]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22234 entries, 0 to 22233
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      22234 non-null  int64  
 1   log_price               22234 non-null  float64
 2   property_type           22234 non-null  object 
 3   room_type               22234 non-null  object 
 4   amenities               22234 non-null  object 
 5   accommodates            22234 non-null  int64  
 6   bathrooms               22183 non-null  float64
 7   bed_type                22234 non-null  object 
 8   cancellation_policy     22234 non-null  object 
 9   cleaning_fee            22234 non-null  bool   
 10  city                    22234 non-null  object 
 11  description             22234 non-null  object 
 12  first_review            17509 non-null  object 
 13  host_has_profile_pic    22178 non-null  object 
 14  host_identity_verified  22178 non-null

# Entrainement

In [21]:
class CustomTransformation():
    def __init__(self):
        """
        Classe simple pour convertir les types de propriétés, de chambres, de politiques d'annulation
        et de types de lits en indices numériques, utilisable pour un algorithme de machine learning.
        """
        self.fitted = False  # Indique si fit_transform a été utilisé
        self.property2index = dict()  # Dictionnaire pour convertir les types de propriétés en indices
        self.room2index = dict()  # Dictionnaire pour convertir les types de chambres en indices
        self.cancellation_mapping = {  # Mapping pour cancellation_policy
            "flexible": 4,
            "moderate": 3,
            "strict": 2,
            "super_strict_30": 1,
            "super_strict_60": 0
        }
        self.bed_type_mapping = {  # Mapping pour bed_type
            "Real Bed": 4,
            "Pull-out Sofa": 3,
            "Futon": 2,
            "Airbed": 1,
            "Couch": 0
        }
        self.max_property_index = 0  # Dernier indice pour les types de propriétés
        self.max_room_index = 0  # Dernier indice pour les types de chambres

    def fit_transform(self, dataset):
        self.fitted = True

        # Transformation pour property_type
        properties = dataset["property_type"].unique()
        print("Tous les types de propriétés : ", properties)
        self.property2index = {prop: i for i, prop in enumerate(properties)}
        self.max_property_index = max(self.property2index.values())

        # Transformation pour room_type
        rooms = dataset["room_type"].unique()
        print("Tous les types de chambres : ", rooms)
        self.room2index = {room: i for i, room in enumerate(rooms)}
        self.max_room_index = max(self.room2index.values())

        # Appliquer les transformations
        return self.transform(dataset)

    def transform(self, dataset):
        # Transforme property_type en indices
        dataset.loc[:, "property_type"] = dataset["property_type"].replace(self.property2index)

        # Transforme room_type en indices
        dataset.loc[:, "room_type"] = dataset["room_type"].replace(self.room2index)

        # Transforme cancellation_policy en indices
        dataset.loc[:, "cancellation_policy"] = dataset["cancellation_policy"].replace(self.cancellation_mapping)

        # Transforme bed_type en indices
        dataset.loc[:, "bed_type"] = dataset["bed_type"].replace(self.bed_type_mapping)

        # Gère les valeurs inconnues pour property_type
        dataset.loc[dataset["property_type"].map(type).eq(str), "property_type"] = np.nan
        dataset["property_type"].fillna(self.max_property_index + 1, inplace=True)

        # Gère les valeurs inconnues pour room_type
        dataset.loc[dataset["room_type"].map(type).eq(str), "room_type"] = np.nan
        dataset["room_type"].fillna(self.max_room_index + 1, inplace=True)

        # Gère les valeurs inconnues pour cancellation_policy
        dataset["cancellation_policy"].fillna(-1, inplace=True)  # Remplace les valeurs manquantes par -1

        # Gère les valeurs inconnues pour bed_type
        dataset["bed_type"].fillna(-1, inplace=True)  # Remplace les valeurs manquantes par -1

        # Remplace les valeurs nulles dans d'autres colonnes
        dataset["bathrooms"].fillna(0, inplace=True)
        dataset["accommodates"].fillna(0, inplace=True)

        return dataset

In [22]:
features_transformer = CustomTransformation()
airbnb_train = features_transformer.fit_transform(airbnb)
airbnb_train.head(10)

Tous les types de propriétés :  [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30]
Tous les types de chambres :  [0 1 2]


  dataset.loc[:, "property_type"] = dataset["property_type"].replace(self.property2index)
  dataset.loc[:, "room_type"] = dataset["room_type"].replace(self.room2index)
  dataset.loc[:, "cancellation_policy"] = dataset["cancellation_policy"].replace(self.cancellation_mapping)
  dataset.loc[:, "bed_type"] = dataset["bed_type"].replace(self.bed_type_mapping)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset["property_type"].fillna(self.max_property_index + 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,last_review,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,zipcode,bedrooms,beds
0,5708593,4.317488,0,0,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",3,1.0,4,0,False,...,,33.782712,-118.13441,Island style Spa Studio,Long Beach,0,,90804,0.0,2.0
1,14483613,4.007333,0,0,"{""Wireless Internet"",""Air conditioning"",Kitche...",4,2.0,4,2,False,...,2017-09-17,40.705468,-73.909439,"Beautiful and Simple Room W/2 Beds, 25 Mins to...",Ridgewood,38,86.0,11385,1.0,2.0
2,10412649,7.090077,1,1,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",6,2.0,4,0,False,...,,38.917537,-77.031651,2br/2ba luxury condo perfect for infant / toddler,U Street Corridor,0,,20009,2.0,2.0
3,17954362,3.555348,0,0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1,1.0,4,0,True,...,2017-09-29,40.736001,-73.924248,Manhattan view from Queens. Lovely single room .,Sunnyside,19,96.0,11104,1.0,1.0
4,9969781,5.480639,0,1,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,4,1,True,...,2017-08-28,37.744896,-122.430665,Zen Captured Noe Valley House,Noe Valley,15,96.0,94131,2.0,2.0
5,13113872,6.214608,1,0,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,4,1,True,...,,40.73858,-74.008752,Modern West Village Apartment,West Village,0,,10014,1.0,1.0
6,12032987,4.49981,2,0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",2,1.0,4,2,True,...,2017-09-22,40.806629,-73.949911,Elegant Private Studio - Town House,Harlem,63,98.0,10027,1.0,1.0
7,12112830,4.394449,3,1,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",3,1.0,3,1,False,...,2017-09-25,40.752495,-73.811256,"Private Suite, Free Metrocard*",Flushing,213,93.0,11355,0.0,2.0
8,16738953,4.025352,1,0,"{Internet,""Wireless Internet"",Kitchen,""Pets al...",3,1.0,4,1,True,...,,34.050001,-118.421469,Spacious beautifully lit livingroom,Westside,0,,90064,1.0,1.0
9,94477,5.521461,1,1,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",4,1.0,4,1,True,...,2017-09-16,40.783344,-73.975775,Fabulous UWS 2 Bedroom Apartment!!,Upper West Side,77,96.0,10024,2.0,3.0


In [23]:
# Identifier les colonnes avec des strings
string_columns = airbnb.select_dtypes(include=['object']).columns
print("Colonnes avec des strings :", string_columns)

Colonnes avec des strings : Index(['property_type', 'room_type', 'amenities', 'bed_type',
       'cancellation_policy', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'name',
       'neighbourhood', 'zipcode'],
      dtype='object')


In [14]:
print("Valeurs uniques dans bed_type :", airbnb["bed_type"].unique())


Valeurs uniques dans bed_type : ['Real Bed' 'Pull-out Sofa' 'Futon' 'Airbed' 'Couch']


In [16]:
airbnb_train["amenities"].unique()

array(['{TV,"Wireless Internet",Kitchen,"Free parking on premises","Pets allowed","Suitable for events",Washer,Dryer,"Smoke detector",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron}',
       '{"Wireless Internet","Air conditioning",Kitchen,Heating,"Family/kid friendly","Smoke detector","Carbon monoxide detector","Fire extinguisher",Essentials,"Lock on bedroom door","24-hour check-in","Hair dryer",Iron,"translation missing: en.hosting_amenity_50","Self Check-In",Keypad,"Bed linens",Microwave,"Coffee maker",Refrigerator,"Dishes and silverware","Cooking basics",Oven,Stove,"Luggage dropoff allowed"}',
       '{TV,"Wireless Internet","Air conditioning",Kitchen,"Free parking on premises","Pets allowed","Elevator in building",Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit","Safety card","Fire extinguisher",Essentials,Shampoo,"Lock on bedroom door","Laptop friendly workspace"}',
       ...,
       '{TV,Internet,"Wirele

In [24]:
airbnb_train["city"].unique()

array(['LA', 'NYC', 'DC', 'SF', 'Chicago', 'Boston'], dtype=object)

In [29]:
airbnb_train["host_has_profile_pic"]

0        t
1        t
2        t
3        t
4        t
        ..
22229    t
22230    t
22231    t
22232    t
22233    t
Name: host_has_profile_pic, Length: 22234, dtype: object

In [27]:
# Identifier les colonnes avec des strings
string_columns = airbnb.select_dtypes(include=['object']).columns
print("Colonnes avec des strings :", string_columns)

# Afficher les valeurs uniques pour chaque colonne de type string
for col in string_columns:
    print(f"\nValeurs uniques dans la colonne '{col}':")
    print(airbnb[col].unique())

Colonnes avec des strings : Index(['property_type', 'room_type', 'amenities', 'bed_type',
       'cancellation_policy', 'city', 'description', 'first_review',
       'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
       'host_since', 'instant_bookable', 'last_review', 'name',
       'neighbourhood', 'zipcode'],
      dtype='object')

Valeurs uniques dans la colonne 'property_type':
[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30]

Valeurs uniques dans la colonne 'room_type':
[0 1 2]

Valeurs uniques dans la colonne 'amenities':
['{TV,"Wireless Internet",Kitchen,"Free parking on premises","Pets allowed","Suitable for events",Washer,Dryer,"Smoke detector",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Hair dryer",Iron}'
 '{"Wireless Internet","Air conditioning",Kitchen,Heating,"Family/kid friendly","Smoke detector","Carbon monoxide detector","Fire extinguisher",Essentials,"Lock on bedroom door","24-hour check-in","Hair dry

In [1]:
print("test")

test
