In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import ast

In [11]:
# Step 1: Load the dataset
df_filtered = pd.read_csv('/content/balanced_data_with_sentiment 1.csv', on_bad_lines='skip')
print(df_filtered.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102750 entries, 0 to 102749
Columns: 119 entries, bathrooms to quarter
dtypes: float64(113), object(6)
memory usage: 93.3+ MB
None


In [12]:
# Step 2: Convert 'created' column to datetime format
df_filtered['created'] = pd.to_datetime(df_filtered['created'])

# Step 3: Extract useful components from the 'created' column
df_filtered['year'] = df_filtered['created'].dt.year
df_filtered['month'] = df_filtered['created'].dt.month
df_filtered['day'] = df_filtered['created'].dt.day
df_filtered['hour'] = df_filtered['created'].dt.hour
df_filtered['minute'] = df_filtered['created'].dt.minute
df_filtered['second'] = df_filtered['created'].dt.second

# Drop the original 'created' column as it's no longer needed
df_filtered.drop(columns=['created', 'manager_id', 'date'], inplace=True)

KeyError: "['manager_id'] not found in axis"

In [5]:
# Step 4: Process list columns ('features' and 'photos')
# Safely convert string representations of lists to actual lists
df_filtered['features'] = df_filtered['features'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
df_filtered['photos'] = df_filtered['photos'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# For 'features' column: Count the number of features in the list
df_filtered['features_count'] = df_filtered['features'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

# For 'photos' column: Count the number of photos in the list
df_filtered['photos_count'] = df_filtered['photos'].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

# Print the counts to verify
print(df_filtered[['features_count', 'photos_count']].head())

# Drop original 'features' and 'photos' columns
df_filtered.drop(columns=['features', 'photos'], inplace=True)

   features_count  photos_count
0               3             6
1               6             7
2               8             5
3              12             7
4               3             3


In [6]:
# Step 5: Handle text columns like 'description' if present
if 'description' in df_filtered.columns:
    df_filtered['description'] = df_filtered['description'].fillna("")
    vectorizer = CountVectorizer(stop_words='english', max_features=100)
    description_matrix = vectorizer.fit_transform(df_filtered['description'])
    description_df = pd.DataFrame(description_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    df_filtered = pd.concat([df_filtered, description_df], axis=1)
    df_filtered.drop(columns=['description'], inplace=True)
print(df_filtered.head())

   bathrooms  bedrooms                       building_id  \
0        1.0         3                                 0   
1        2.0         4  1f31e9001330dad4c202f92dc8ab8724   
2        1.0         1  5565db9b7cba3603834c4aa6f2950960   
3        2.0         2  2787598123c55dbf45b514958909c79c   
4        1.0         0  a06178cb6130c1cc3d7fde9d5cf6417f   

             display_address  latitude  listing_id  longitude  price  \
0  W 43rd Street and 9th Ave   40.7591     7196004   -73.9922   3325   
1                  E 66th St   40.7640     7099693   -73.9591   5500   
2                   Broadway   40.8198     7012584   -73.9578   1995   
3           West 42nd Street   40.7610     7173483   -73.9990   4600   
4           East 89th Street   40.7794     7130142   -73.9498   1999   

              street_address interest_level  ...  unit  view  viewing  views  \
0  W 43rd Street and 9th Ave           high  ...     0     0        0      0   
1              354 E 66th St           high  .

In [7]:
# Step 6: One-hot encode categorical columns
categorical_columns = ['building_id', 'display_address', 'street_address']
df_encoded = pd.get_dummies(df_filtered, columns=categorical_columns, drop_first=True)

# Step 7: Label encode the target column (interest_level)
label_encoder = LabelEncoder()
df_encoded['interest_level'] = label_encoder.fit_transform(df_encoded['interest_level'])

In [8]:
# Define the target column (ensure this matches your target column name)
target_column = 'interest_level'
# Step 8: Separate features (X) and target (y)
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

In [9]:
# Step 9: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [10]:
# Step 10: Train a decision tree classifier
clf = DecisionTreeClassifier(max_depth=2, random_state=5)
clf.fit(X_train, y_train)

# Step 11: Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.4350851581508516


**Random Forest**

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]

In [None]:
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

In [None]:
# Manually select three sets of hyperparameters
param_sets = [
    {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2},
    {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5},
    {'n_estimators': 300, 'max_depth': 30, 'min_samples_split': 10},
]

# Store results for comparison
results = []

for i, params in enumerate(param_sets):
    print(f"Training model {i + 1} with parameters: {params}")
    # Train Random Forest with the selected hyperparameters
    rf_model = RandomForestClassifier(**params, random_state=42)
    rf_model.fit(X_train, y_train)

    # Evaluate on the test set
    y_test_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)

    # Store results
    results.append({'params': params, 'accuracy': accuracy})

    # Print accuracy and confusion matrix
    print(f"Model {i + 1} Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_test_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"Confusion Matrix for Model {i + 1}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

# Compare results
print("\nComparison of Results:")
for result in results:
    print(f"Parameters: {result['params']}, Accuracy: {result['accuracy']:.4f}")

**Cross-validation**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the first set of hyperparameters
params = {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2}

# Number of folds for cross-validation
cv_folds = 5

print(f"Performing cross-validation with parameters: {params}")
# Initialize Random Forest with the selected hyperparameters
rf_model = RandomForestClassifier(**params, random_state=42)

# Perform cross-validation
scores = cross_val_score(rf_model, X_train, y_train, cv=cv_folds, scoring='accuracy')

# Calculate mean accuracy and standard deviation
mean_score = scores.mean()
std_dev = scores.std()

# Print cross-validation results
print(f"Mean Accuracy: {mean_score:.4f}, Std Dev: {std_dev:.4f}")