<a href="https://colab.research.google.com/github/cerfking/CS-6220-Data-Mining-Project/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install --force-reinstall "scikit-learn==1.0.2"

In [2]:
#import dependencies
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:

# Load the Boston housing dataset bundled with scikit-learn. Later cells read
# its .data, .feature_names and .target attributes.
from sklearn.datasets import load_boston
boston = load_boston()

In [None]:
# Features and target as separate frames; both use the default integer index.
df_x = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df_y = pd.DataFrame({'MEDV': boston.target})

# Place the target next to the features (MEDV becomes the last column).
df = df_x.join(df_y)

df.head()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column
df_x.describe()

In [None]:
# Pairwise correlation matrix across all features and the MEDV target

df.corr()

In [None]:
from ast import increment_lineno
# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use({'figure.figsize' : (15,10)})
plt.rcParams['axes.unicode_minus'] = False
df.hist(bins = 15)

In [None]:
# Grid of pairwise plots for every pair of columns in df
sns.pairplot(df)

In [None]:
# Line plot of every column against the row index, all on one shared axis
df.plot()

In [None]:
# Same line plots, but one subplot per column on a tall 20x45 figure
df.plot(subplots=True, figsize=(20,45))

In [None]:
# Histograms of all columns drawn on a single axis, 15 bins
df.plot(kind='hist', bins=15)

In [None]:
# Box plots comparing the spread of four selected features
sns.boxplot(data=df[['INDUS','RM','RAD','LSTAT']])

In [None]:
# Heatmap of the correlation coefficients between variables, each cell annotated with its value
sns.heatmap(df.corr(), square=True, annot=True, cmap='YlGnBu')

In [None]:
# Scatter each feature against the target price, one figure per feature.
# The target column itself is skipped: MEDV plotted against MEDV is only a
# diagonal line and carries no information.
for feature in df.columns.drop('MEDV'):
  plt.scatter(df[feature], df['MEDV'])
  plt.title('{} and house price'.format(feature))
  plt.xlabel(feature)
  plt.ylabel('House Price')
  plt.yticks(range(0, 60, 5))
  plt.grid()
  plt.show()

In [None]:
# Initialize the linear regression model with default settings (it is fitted in a later cell)
reg = linear_model.LinearRegression()

In [6]:
# Hold out 33% of the rows for testing (67% for training); the fixed seed
# keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Train the model: fit the linear regression on the training split
reg.fit(x_train, y_train)

LinearRegression()

In [None]:
# Print the fitted coefficients, one per feature
print(reg.coef_)

In [None]:
# Predict on the held-out test features and print the raw predictions
y_pred = reg.predict(x_test)
print(y_pred)

In [None]:
# Print the actual target values of the test split, for comparison with the predictions
print(y_test)

In [None]:
# Check the model performance using Mean Squared Error (MSE), computed by hand.
# y_test is a pandas DataFrame; converting it to a NumPy array keeps the
# subtraction purely positional and makes np.mean return one scalar, instead of
# a result whose shape depends on how the installed pandas reduces a DataFrame.
print(np.mean((y_pred - y_test.to_numpy()) ** 2))

In [None]:
# Compute the same test-set Mean Squared Error (MSE) with sklearn.metrics
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search
param_grid = {
    'n_estimators': [5, 10, 20, 50, 100, 200],  # the number of estimators (trees)
    'max_depth': [3, 5, 7],                      # maximum depth of each tree
    # Fraction of the features considered at each split. The last entry is the
    # float 1.0 (all features); an integer 1 would mean "exactly one feature".
    'max_features': [0.6, 0.7, 0.8, 1.0],
}

# Fixed seed so the search and the selected forest are reproducible across runs
rf = RandomForestRegressor(random_state=42)

# Construct a search regressor based on the random forest regressor
# (3-fold cross-validation over every combination in param_grid)
grid = GridSearchCV(rf, param_grid=param_grid, cv=3)

# y_train is a single-column DataFrame; flatten it to the 1-D target the forest
# expects so that each fit does not emit a column-vector conversion warning.
grid.fit(x_train, np.ravel(y_train))

# View the parameters of the best performing model (printed explicitly, since
# only the last expression of a cell is displayed automatically)
print(grid.best_params_)

# Use the estimator refitted with the best-performing parameters from here on
rf_reg = grid.best_estimator_
rf_reg

In [None]:
# Visualization of a single decision tree taken from the fitted forest
from sklearn import tree
import pydotplus
from IPython.display import Image, display

# Fourth tree of the ensemble (index 3 requires a forest of at least four trees)
estimator = rf_reg.estimators_[3]
dot_data = tree.export_graphviz(
    estimator,
    out_file=None,                     # return the DOT source as a string
    feature_names=list(df_x.columns),  # label splits with column names instead of X[i]
    filled=True,
    rounded=True,
)
# Render the DOT source to a PNG and show it inline
graph = pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))

In [None]:
# Rank the features from most to least important according to the fitted forest
feature_names = df_x.columns
feature_importances = rf_reg.feature_importances_
# argsort is ascending, so reverse it to get the most important feature first
indices = np.argsort(feature_importances)[::-1]
for name, importance in zip(feature_names[indices], feature_importances[indices]):
  print("feature %s (%f)" % (name, importance))

In [None]:
# Importance of each feature in the random forest model, drawn in the order given by `indices`
bar_positions = range(len(feature_importances))
ordered_names = np.array(feature_names)[indices]
plt.figure(figsize=(16, 8))
plt.bar(bar_positions, feature_importances[indices], color='b')
plt.xticks(bar_positions, ordered_names, color='b')
plt.show()

In [None]:
# Visualize the regression predictions on the test set
import matplotlib.pyplot as plt
from IPython.display import display

predictions = rf_reg.predict(x_test)

# Flatten labels and predictions to 1-D whatever their container type
# (ndarray, Series, single-column DataFrame, list). np.ravel covers every case,
# so y_test_1d can no longer be left undefined for an unexpected input type or
# collapse to a scalar for a one-row frame.
y_test_1d = np.ravel(y_test)
predictions_1d = np.ravel(predictions)

# Keep the row labels of the test split when y_test carries an index;
# otherwise fall back to the default integer index.
result = pd.DataFrame(
    {"label": y_test_1d, "prediction": predictions_1d},
    index=getattr(y_test, "index", None),
)
# A bare expression in the middle of a cell is not shown, so display explicitly
display(result.head())

# Actual values as black dots, predictions as red dots, on the same axes
result['label'].plot(style='k.', figsize=(15,5))
result['prediction'].plot(style='r.')
plt.legend(fontsize=15,markerscale=3)
plt.tick_params(labelsize=25)
plt.grid()

In [None]:
from sklearn import metrics
# Test-set mean squared error of the tuned random forest
rf_test_predictions = rf_reg.predict(x_test)
MSE = metrics.mean_squared_error(y_test, rf_test_predictions)
MSE