In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import random
import seaborn as sns #visualisation
np.random.seed(42)
random.seed(42)
import warnings
warnings.filterwarnings("ignore")

In [None]:
df1= pd.read_csv("Rail freight transport.csv")

In [None]:
df1.head()

In [None]:
df1.info()

In [None]:
df1.shape

In [None]:
df1 = pd.DataFrame(df1)
column_names = df1.columns
print(column_names)

In [None]:
df1

In [None]:
# Keep the identifying columns plus the measurement and give the measurement
# a descriptive name. rename() is chained instead of applied in place, so the
# operation never mutates a column slice of the loaded table.
df1 = df1[['Country', 'Year', 'Value']].rename(
    columns={'Value': 'Total rail freight transport(M)'}
)
df1

In [None]:
df1

In [None]:
# Every year each country is expected to cover.
full_years = list(range(1981, 2023))

# Number of rows present for each (Country, Year) pair.
grouped_data = (
    df1.groupby(['Country', 'Year'])
    .size()
    .reset_index(name='Count')
)

# Report, per country, the expected years that have no rows at all.
for country in df1['Country'].unique():
    country_data = grouped_data[grouped_data['Country'] == country]
    observed_years = country_data['Year'].values
    missing_years = [candidate for candidate in full_years if not (candidate in observed_years)]
    if len(missing_years) > 0:
        print(f"在 {country} 中缺失的年份：{missing_years}")


In [None]:
import pmdarima
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Hold-out evaluation of an ARIMA forecast for one country's annual series.
df = df1.copy().set_index("Year")

selected_country = 'Japan'
selected_data = df.loc[df['Country'] == selected_country, 'Total rail freight transport(M)'].copy()
# Work with integer year labels in ascending order so the year slices below
# are label slices on numbers (elsewhere in this notebook Year is compared
# against integers, so string labels such as '2018' would not match).
selected_data.index = selected_data.index.astype(int)
selected_data = selected_data.sort_index()

# The three years after the training window are held out for scoring; the
# same training window is used for order selection and for fitting.
last_train_year = 2018
forecast_steps = 3
forecast_years = list(range(last_train_year + 1, last_train_year + 1 + forecast_steps))
train_data = selected_data.loc[:last_train_year]
actual_values = selected_data.loc[forecast_years[0]:forecast_years[-1]]

# Let auto_arima pick the ARIMA order.
# NOTE(review): the search is seasonal (m=12, D=1) but only the non-seasonal
# order is reused for the fit below; confirm a 12-period season is intended
# for yearly observations.
stepwise_model = auto_arima(train_data, start_p=0, start_q=0,
                        max_p=3, max_q=3, m=12,
                        start_P=0, seasonal=True,
                        d=1, D=1, trace=True,
                        error_action='ignore',
                        suppress_warnings=True,
                        stepwise=True)

print(stepwise_model.summary())

# Selected (p, d, q).
best_order = tuple(stepwise_model.order)

# Fit the selected order on the training window.
model = sm.tsa.ARIMA(train_data, order=best_order)
results = model.fit()

# Forecast the held-out years.
forecast = results.get_forecast(steps=forecast_steps)

# Label the forecast and its interval with the calendar years explicitly:
# the training index has no frequency attached, so the labels statsmodels
# returns are not guaranteed to be years.
forecast_mean = pd.Series(np.asarray(forecast.predicted_mean), index=forecast_years, name='Forecast')
forecast_bounds = np.asarray(forecast.conf_int())

# Plot the observed series, the forecast and its confidence band.
plt.figure(figsize=(12, 6))
plt.plot(selected_data, label='Observed')
plt.plot(forecast_mean, label='Forecast', color='red')
plt.fill_between(forecast_mean.index, forecast_bounds[:, 0], forecast_bounds[:, 1], color='pink', alpha=0.2)
plt.xlabel('Year')
plt.ylabel('Total rail freight transport(M)')
plt.legend()
plt.show()

# Print the forecast.
print("预测结果:")
print(forecast_mean)

# Score only the years that have both an observation and a forecast.
scored_years = actual_values.dropna().index.intersection(forecast_mean.index)
if len(scored_years) == 0:
    raise ValueError(
        f"No observed values for {selected_country} in {forecast_years}; nothing to score."
    )
actual_scored = actual_values.loc[scored_years]
forecast_scored = forecast_mean.loc[scored_years]

# Root mean squared error (RMSE)
rmse = np.sqrt(mean_squared_error(actual_scored, forecast_scored))
print(f'RMSE: {rmse}')

# Mean squared error (MSE)
mse = mean_squared_error(actual_scored, forecast_scored)
print(f'MSE: {mse}')

# R-squared; undefined when the observed values do not vary.
mean_actual = actual_scored.mean()
sst = ((actual_scored - mean_actual) ** 2).sum()
ssr = ((forecast_scored - actual_scored) ** 2).sum()
rsquared = 1 - (ssr / sst) if sst > 0 else float('nan')
print(f'R-squared: {rsquared}')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error

class TimeSeriesForecaster:
    """Fit an ARIMA model to one country's series and forecast ahead.

    The wrapped table must contain a 'Country' column in addition to the time
    and value columns named in each forecast_data() call.
    """

    def __init__(self, data):
        """Keep a reference to the table that forecasts are drawn from."""
        self.data = data

    def forecast_data(self, selected_column, time_column, train_start_year, train_end_year, forecast_steps, selected_city):
        """Forecast ``selected_column`` for one country.

        Args:
            selected_column: name of the value column to model.
            time_column: name of the year column; its values must be
                convertible to int.
            train_start_year: first year of the training window (int or
                numeric string), inclusive.
            train_end_year: last year of the training window, inclusive.
            forecast_steps: number of steps to forecast past the window.
            selected_city: value matched against the 'Country' column.

        Returns:
            The predicted mean produced by the fitted model's get_forecast().

        Raises:
            ValueError: if no rows match the country and training window.
        """
        # Accept the window bounds as strings or numbers.
        train_start_year = int(train_start_year)
        train_end_year = int(train_end_year)

        # Rows for the requested country inside the training window; the year
        # column is converted once and reused for both bounds.
        years = self.data[time_column].astype(int)
        in_window = (
            (self.data['Country'] == selected_city)
            & (years >= train_start_year)
            & (years <= train_end_year)
        )
        selected_data = self.data.loc[in_window, [time_column, selected_column]]
        if selected_data.empty:
            raise ValueError(
                f"No rows for {selected_city!r} between {train_start_year} and {train_end_year}."
            )

        # Index the series by the time column.
        selected_data = selected_data.set_index(time_column)

        # Let auto_arima pick the ARIMA order.
        # NOTE(review): the search is seasonal (m=12, D=1) but only the
        # non-seasonal order is reused for the fit below; confirm a 12-period
        # season is intended for this data.
        stepwise_model = auto_arima(selected_data[selected_column], start_p=0, start_q=0,
                                    max_p=3, max_q=3, m=12,
                                    start_P=0, seasonal=True,
                                    d=1, D=1, trace=True,
                                    error_action='ignore',
                                    suppress_warnings=True,
                                    stepwise=True)

        print(stepwise_model.summary())

        # Selected (p, d, q).
        best_order = tuple(stepwise_model.order)

        # Fit the selected order on the same training series.
        model = sm.tsa.ARIMA(selected_data[selected_column], order=best_order)
        results = model.fit()

        # Forecast the requested number of steps and return the point forecast.
        forecast = results.get_forecast(steps=forecast_steps)
        return forecast.predicted_mean

# Wrap the freight table in a forecaster.
forecaster = TimeSeriesForecaster(df1)

# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Total rail freight transport(M)', 'Year'
train_start_year, train_end_year = '1981', '2021'
forecast_steps = 1
selected_city = 'Japan'

# Run the forecast.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")

# Further error metrics and plots can be added here.


In [None]:
# Row to append to the freight table: Japan, 2022.
# NOTE(review): the value is a typed-in literal; presumably it was copied from
# the one-step forecast printed by the preceding cell — confirm.
new_data = {
    'Country': ["Japan"],
    'Year': [2022],
    'Total rail freight transport(M)': [18492.181041]
}

In [None]:
df1 = pd.concat([df1, pd.DataFrame(new_data)], ignore_index=True)

In [None]:
df1

In [None]:
# Keep 1995-2022 (inclusive) and renumber the rows. Chained so that the index
# reset is never applied in place to a filtered slice.
df1 = df1[df1['Year'].between(1995, 2022)].reset_index(drop=True)


In [None]:
df1

In [None]:
df2 = pd.read_csv("Rail passenger transport.csv")

In [None]:
df2.head()

In [None]:
df2.info()

In [None]:
df2.shape

In [None]:
# Keep the identifying columns plus the measurement and give the measurement
# a descriptive name. rename() is chained instead of applied in place, so the
# operation never mutates a column slice of the loaded table.
df2 = df2[["Country", "Year", "Value"]].rename(
    columns={"Value": "Rail passenger transport(M)"}
)
df2

In [None]:
# Every year each country is expected to cover.
full_years = list(range(1981, 2023))

# Number of rows present for each (Country, Year) pair.
grouped_data = (
    df2.groupby(['Country', 'Year'])
    .size()
    .reset_index(name='Count')
)

# Report, per country, the expected years that have no rows at all.
for country in df2['Country'].unique():
    country_data = grouped_data[grouped_data['Country'] == country]
    observed_years = country_data['Year'].values
    missing_years = [candidate for candidate in full_years if not (candidate in observed_years)]
    if len(missing_years) > 0:
        print(f"在 {country} 中缺失的年份：{missing_years}")

In [None]:
# Wrap the passenger table in a forecaster.
forecaster = TimeSeriesForecaster(df2)

# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Rail passenger transport(M)', 'Year'
train_start_year, train_end_year = '1981', '2021'
forecast_steps = 1
selected_city = 'Japan'

# Run the forecast.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")

# Further error metrics and plots can be added here.

In [None]:
# Row to append to the passenger table: Japan, 2022.
# The value is taken from the one-step passenger forecast computed by the
# preceding cell. The literal used before (18492.181041) was the same number
# that is appended to the freight table, i.e. a copy-paste from that cell.
new_data = {
    'Country': ["Japan"],
    'Year': [2022],
    'Rail passenger transport(M)': [float(np.asarray(forecast_result)[0])]
}

In [None]:
df2 = pd.concat([df2, pd.DataFrame(new_data)], ignore_index=True)

In [None]:
df2

In [None]:
# Keep 1995-2022 (inclusive) and renumber the rows. Chained so that the index
# reset is never applied in place to a filtered slice.
df2 = df2[df2['Year'].between(1995, 2022)].reset_index(drop=True)


In [None]:
df2

In [None]:
df3 = pd.read_csv("Rail infrastructure investment.csv")

In [None]:
df3.head()

In [None]:
df3.info()

In [None]:
df3.shape

In [None]:
# Keep the identifying columns plus the measurement and give the measurement
# a descriptive name. rename() is chained instead of applied in place, so the
# operation never mutates a column slice of the loaded table.
df3 = df3[["Country", "Year", "Value"]].rename(
    columns={"Value": "Rail infrastructure investment(Euro)"}
)
df3

In [None]:
# Every year each country is expected to cover.
full_years = list(range(1995, 2023))

# Number of rows present for each (Country, Year) pair.
grouped_data = (
    df3.groupby(['Country', 'Year'])
    .size()
    .reset_index(name='Count')
)

# Report, per country, the expected years that have no rows at all.
for country in df3['Country'].unique():
    country_data = grouped_data[grouped_data['Country'] == country]
    observed_years = country_data['Year'].values
    missing_years = [candidate for candidate in full_years if not (candidate in observed_years)]
    if len(missing_years) > 0:
        print(f"在 {country} 中缺失的年份：{missing_years}")

In [None]:
# Wrap the investment table in a forecaster.
forecaster = TimeSeriesForecaster(df3)

# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Rail infrastructure investment(Euro)', 'Year'
train_start_year, train_end_year = '1995', '2021'
forecast_steps = 1
selected_city = 'Finland'

# Run the forecast.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")


In [None]:
# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Rail infrastructure investment(Euro)', 'Year'
train_start_year, train_end_year = '1995', '2020'
forecast_steps = 2
selected_city = 'Ireland'

# Run the forecast with the forecaster created earlier.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")


In [None]:
# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Rail infrastructure investment(Euro)', 'Year'
train_start_year, train_end_year = '1995', '2020'
forecast_steps = 2
selected_city = 'Japan'

# Run the forecast with the forecaster created earlier.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")

In [None]:
# Value column, time column, training window, horizon and country.
selected_column, time_column = 'Rail infrastructure investment(Euro)', 'Year'
train_start_year, train_end_year = '1995', '2020'
forecast_steps = 2
selected_city = 'Luxembourg'

# Run the forecast with the forecaster created earlier.
forecast_result = forecaster.forecast_data(
    selected_column=selected_column,
    time_column=time_column,
    train_start_year=train_start_year,
    train_end_year=train_end_year,
    forecast_steps=forecast_steps,
    selected_city=selected_city,
)

# Show the forecast.
print(f"预测结果:\n{forecast_result}")

In [None]:
# Rows to append to the investment table: one per (country, year) listed
# below; the three lists are aligned by position.
# NOTE(review): the values are typed-in literals; presumably they were copied
# from the forecasts printed by the four preceding cells — confirm.
new_data = {
    'Country': ['Finland',"Ireland","Ireland","Japan","Japan","Luxembourg","Luxembourg"],
    'Year': [2022,2021,2022,2021,2022,2021,2022],
    'Rail infrastructure investment(Euro)': [606000000.0,1.695783e+08,1.695783e+08,1.701974e+10,1.701974e+10,2.650807e+08,2.624219e+08]
}

In [None]:
df3 = pd.concat([df3, pd.DataFrame(new_data)], ignore_index=True)

In [None]:
df3

In [None]:
sheet1 = pd.read_excel("WPP2022_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT_REV1.xlsx",sheet_name = 0)

In [None]:
sheet1.shape

In [None]:
sheet1.head()

In [None]:
# Print the first 20 rows of the first sheet, one row per line. The loop
# iterates rows, so the printed label says "row" rather than "column".
for index, row in sheet1.head(20).iterrows():
    print(f"row number: {index + 1}, data: {list(row)}")

In [None]:
columns = list(sheet1.iloc[15])
columns

In [None]:
my_list = columns

# Header labels whose positions in the header list are wanted.
target_values = ['Region, subregion, country or area *', 'Year', 'Total Population, as of 1 January (thousands)']

# Position of each label in the header list, or None when it is absent.
value_indices = {}
for value in target_values:
    value_indices[value] = my_list.index(value) if value in my_list else None

# Report the position found for each label.
for value, index in value_indices.items():
    if index is None:
        print(f"值 '{value}' 未找到")
    else:
        print(f"值 '{value}' 的索引是 {index}")


# Rows whose third column holds the country name.
is_ireland = sheet1.iloc[:, 2] == "Ireland"
matching_rows = sheet1[is_ireland]
print(matching_rows)

# Values at column positions 2, 10 and 11 of those rows.
column_3_data, column_11_data, column_12_data = (
    list(matching_rows.iloc[:, position]) for position in (2, 10, 11)
)

print("第3列数据:", column_3_data)
print("第11列数据:", column_11_data)
print("第12列数据:", column_12_data)

# Two-column table built from the values at positions 10 and 11.
Ireland_population = pd.DataFrame({
    "Year": column_11_data,
    "Ireland_population(thousands)": column_12_data,
})

Ireland_population

In [None]:
my_list = columns

# Header labels whose positions in the header list are wanted.
target_values = ['Region, subregion, country or area *', 'Year', 'Total Population, as of 1 January (thousands)']

# Position of each label in the header list, or None when it is absent.
value_indices = {}
for value in target_values:
    value_indices[value] = my_list.index(value) if value in my_list else None

# Report the position found for each label.
for value, index in value_indices.items():
    if index is None:
        print(f"值 '{value}' 未找到")
    else:
        print(f"值 '{value}' 的索引是 {index}")


In [None]:
import pandas as pd

class SearchPopulationData:
    """Extract two positional columns for one country from a worksheet table."""

    def __init__(self, sheet, country_name):
        """Remember the table to search and the country name to match."""
        self.sheet, self.country_name = sheet, country_name

    def filter_data_by_country(self):
        """Return the values at column positions 10 and 11, as two lists,
        for the rows whose third column equals the country name."""
        is_country = self.sheet.iloc[:, 2] == self.country_name
        country_rows = self.sheet[is_country]
        return list(country_rows.iloc[:, 10]), list(country_rows.iloc[:, 11])

    def create_population_dataframe(self):
        """Return a table with a 'Year' column (position-10 values) and a
        '<country>_population(thousands)' column (position-11 values)."""
        years, counts = self.filter_data_by_country()
        value_label = f"{self.country_name}_population(thousands)"
        return pd.DataFrame({"Year": years, value_label: counts})

In [None]:
# Build and print the sheet1 population table for each study country.
population_frames_sheet1 = {}
for country_name in ("Finland", "Ireland", "Japan", "Luxembourg"):
    population_frames_sheet1[country_name] = SearchPopulationData(sheet1, country_name).create_population_dataframe()
    print(f"{country_name} Population Data1:")
    print(population_frames_sheet1[country_name])

# Expose each table under its own name.
Finland_population_data1 = population_frames_sheet1["Finland"]
Ireland_population_data1 = population_frames_sheet1["Ireland"]
Japan_population_data1 = population_frames_sheet1["Japan"]
Luxembourg_population_data1 = population_frames_sheet1["Luxembourg"]

In [None]:
sheet2= pd.read_excel("WPP2022_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT_REV1.xlsx",sheet_name = 1)

In [None]:
sheet2.shape

In [None]:
sheet2.head()

In [None]:
# Print the first 20 rows of the second sheet, one row per line. This cell
# follows the sheet2 load and previews, so it inspects sheet2 (it previously
# re-printed sheet1); the loop iterates rows, so the label says "row".
for index, row in sheet2.head(20).iterrows():
    print(f"row number: {index + 1}, data: {list(row)}")
    

In [None]:
# Build and print the sheet2 population table for each study country.
population_frames_sheet2 = {}
for country_name in ("Finland", "Ireland", "Japan", "Luxembourg"):
    population_frames_sheet2[country_name] = SearchPopulationData(sheet2, country_name).create_population_dataframe()
    print(f"{country_name} Population Data2:")
    print(population_frames_sheet2[country_name])

# Expose each table under its own name.
Finland_population_data2 = population_frames_sheet2["Finland"]
Ireland_population_data2 = population_frames_sheet2["Ireland"]
Japan_population_data2 = population_frames_sheet2["Japan"]
Luxembourg_population_data2 = population_frames_sheet2["Luxembourg"]

In [None]:
# Stack the Finland rows from both sheets and renumber them from zero.
Finland_population_data = pd.concat(
    [Finland_population_data1, Finland_population_data2], axis=0
).reset_index(drop=True)

Finland_population_data["Country"] = "Finland"

# Row numbers at which the years 1995 and 2022 occur.
index_value1 = Finland_population_data.index[(Finland_population_data['Year'] == 1995).to_numpy()].tolist()
index_value2 = Finland_population_data.index[(Finland_population_data['Year'] == 2022).to_numpy()].tolist()

# Keep the rows from the first 1995 row through the first 2022 row and
# switch to the country-independent column name.
first_row, last_row = index_value1[0], index_value2[0]
Finland_population_data = Finland_population_data.iloc[first_row:last_row + 1].rename(
    columns={"Finland_population(thousands)": "population(thousands)"}
)

In [None]:
# Stack the Ireland rows from both sheets and renumber them from zero.
Ireland_population_data = pd.concat(
    [Ireland_population_data1, Ireland_population_data2], axis=0
).reset_index(drop=True)

Ireland_population_data["Country"] = "Ireland"

# Row numbers at which the years 1995 and 2022 occur.
index_value1 = Ireland_population_data.index[(Ireland_population_data['Year'] == 1995).to_numpy()].tolist()
index_value2 = Ireland_population_data.index[(Ireland_population_data['Year'] == 2022).to_numpy()].tolist()

# Keep the rows from the first 1995 row through the first 2022 row and
# switch to the country-independent column name.
first_row, last_row = index_value1[0], index_value2[0]
Ireland_population_data = Ireland_population_data.iloc[first_row:last_row + 1].rename(
    columns={"Ireland_population(thousands)": "population(thousands)"}
)

In [None]:
# Stack the Japan rows from both sheets and renumber them from zero.
Japan_population_data = pd.concat(
    [Japan_population_data1, Japan_population_data2], axis=0
).reset_index(drop=True)

Japan_population_data["Country"] = "Japan"

# Row numbers at which the years 1995 and 2022 occur.
index_value1 = Japan_population_data.index[(Japan_population_data['Year'] == 1995).to_numpy()].tolist()
index_value2 = Japan_population_data.index[(Japan_population_data['Year'] == 2022).to_numpy()].tolist()

# Keep the rows from the first 1995 row through the first 2022 row and
# switch to the country-independent column name.
first_row, last_row = index_value1[0], index_value2[0]
Japan_population_data = Japan_population_data.iloc[first_row:last_row + 1].rename(
    columns={"Japan_population(thousands)": "population(thousands)"}
)

In [None]:
Japan_population_data

In [None]:
# Stack the Luxembourg rows from both sheets and renumber them from zero.
Luxembourg_population_data = pd.concat(
    [Luxembourg_population_data1, Luxembourg_population_data2], axis=0
).reset_index(drop=True)

Luxembourg_population_data["Country"] = "Luxembourg"

# Row numbers at which the years 1995 and 2022 occur.
index_value1 = Luxembourg_population_data.index[(Luxembourg_population_data['Year'] == 1995).to_numpy()].tolist()
index_value2 = Luxembourg_population_data.index[(Luxembourg_population_data['Year'] == 2022).to_numpy()].tolist()

# Keep the rows from the first 1995 row through the first 2022 row and
# switch to the country-independent column name.
first_row, last_row = index_value1[0], index_value2[0]
Luxembourg_population_data = Luxembourg_population_data.iloc[first_row:last_row + 1].rename(
    columns={"Luxembourg_population(thousands)": "population(thousands)"}
)

In [None]:
Luxembourg_population_data

In [None]:
df5 = pd.concat([Finland_population_data,Ireland_population_data,Japan_population_data,Luxembourg_population_data])

In [None]:
df5.reset_index(drop = True,inplace = True )

In [None]:
df5

In [None]:
GDP_per_head= pd.read_csv("GDP per head of population.csv")

In [None]:
GDP_per_head.head()

In [None]:
GDP_per_head.info()

In [None]:
GDP_per_head.shape

In [None]:
# Keep the identifying columns plus the measurement, and align the column
# names with the other tables ("TIME" becomes "Year"). rename() is chained
# instead of applied in place, so the operation never mutates a column slice
# of the loaded table.
GDP_per_head = GDP_per_head[["Country", "TIME", "Value"]].rename(
    columns={"Value": "GDP_per_head(USD)", "TIME": "Year"}
)
GDP_per_head

In [None]:
df6 = GDP_per_head

In [None]:
from functools import reduce

# Tables to combine; each is keyed by Year and Country.
dataframes = [df1, df2, df3, df5, df6]

# Outer-join the tables left to right on the shared keys, so a
# (Year, Country) pair present in any table is kept.
merge_keys = ['Year', 'Country']
combined_df = reduce(
    lambda merged, following: merged.merge(following, on=merge_keys, how='outer'),
    dataframes,
)

In [None]:
combined_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

class DataAnalyzer:
    """Per-country normality checks (Q-Q plots and Shapiro-Wilk) for one column.

    The table must contain a 'Country' column and the column being analysed.
    """

    # Fewest non-missing values for which a country is plotted and tested;
    # the Shapiro-Wilk test needs at least three observations.
    MIN_OBSERVATIONS = 3
    # Number of country panels drawn side by side in one figure.
    PANELS_PER_FIGURE = 4

    def __init__(self, data, column_name):
        """Store the table and the name of the column to analyse."""
        self.data = data
        self.column_name = column_name
        self.columns_to_analyze = self.data.columns[2:]
        self.countries = self.data['Country'].unique()

    def _country_values(self, country):
        """Return the analysed column for one country with missing values removed."""
        country_rows = self.data[self.data['Country'] == country]
        return country_rows[self.column_name].dropna()

    def analyze_column(self):
        """Draw a Q-Q plot and print a Shapiro-Wilk verdict for every country.

        Countries are drawn in groups of PANELS_PER_FIGURE, one figure per
        group, so later groups no longer draw over the panels of earlier ones.
        Missing values are dropped first, and countries with fewer than
        MIN_OBSERVATIONS remaining values get an empty panel and a "skipped"
        line instead of a test result.
        """
        # Q-Q plots
        for start in range(0, len(self.countries), self.PANELS_PER_FIGURE):
            part_countries = self.countries[start:start + self.PANELS_PER_FIGURE]

            plt.figure(figsize=(12, 5))
            for j, country in enumerate(part_countries):
                plt.subplot(1, self.PANELS_PER_FIGURE, j + 1)

                values = self._country_values(country)
                if len(values) >= self.MIN_OBSERVATIONS:
                    stats.probplot(values, dist="norm", plot=plt, rvalue=True, fit=True)
                plt.title(f'{country}')

            plt.tight_layout()
            # Extra horizontal space between panels.
            plt.subplots_adjust(wspace=0.5)
            plt.show()

        # Shapiro-Wilk test
        alpha = 0.05
        for country in self.countries:
            values = self._country_values(country)
            if len(values) < self.MIN_OBSERVATIONS:
                print(f'{self.column_name} in {country} - Shapiro-Wilk Test: skipped '
                      f'(only {len(values)} non-missing values)')
                continue

            stat, p = stats.shapiro(values)
            if p > alpha:
                shapiro_result = 'Follows Normal Distribution'
            else:
                shapiro_result = 'Does Not Follow Normal Distribution'

            print(f'{self.column_name} in {country} - Shapiro-Wilk Test: {shapiro_result} (p-value: {p:.4f})')

data =  combined_df  # second name for the merged table (not a file path); the analyzer cells pass it to DataAnalyzer

In [None]:

analyzer = DataAnalyzer(data, "Total rail freight transport(M)")  # second argument is the column to analyse
analyzer.analyze_column()

In [None]:
analyzer = DataAnalyzer(data, "Rail passenger transport(M)")  # second argument is the column to analyse
analyzer.analyze_column()

In [None]:
analyzer = DataAnalyzer(data, "Rail infrastructure investment(Euro)")  # second argument is the column to analyse
analyzer.analyze_column()

In [None]:
analyzer = DataAnalyzer(data, "population(thousands)")  # second argument is the column to analyse
analyzer.analyze_column()

In [None]:
analyzer = DataAnalyzer(data, "GDP_per_head(USD)")  # second argument is the column to analyse
analyzer.analyze_column()

In [None]:
import pandas as pd
from scipy.stats import levene

# Passenger transport divided by population.
combined_df['Rail passenger transport per Capita'] = combined_df['Rail passenger transport(M)'] / combined_df['population(thousands)']

# Per-country samples, read from combined_df (the table the column was just
# added to). The merged table is an outer join, so rows with no passenger or
# population figure hold NaN; they are removed so they cannot propagate into
# the test statistic.
per_capita_column = 'Rail passenger transport per Capita'
luxembourg_data = combined_df.loc[combined_df['Country'] == 'Luxembourg', per_capita_column].dropna()
ireland_data = combined_df.loc[combined_df['Country'] == 'Ireland', per_capita_column].dropna()
finland_data = combined_df.loc[combined_df['Country'] == 'Finland', per_capita_column].dropna()

# Levene's test for equality of variances.
statistic, p_value = levene(luxembourg_data, ireland_data, finland_data)

# Report the result.
print(f'Levene\'s Test Statistic: {statistic:.4f}')
print(f'p-value: {p_value:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value < alpha:
    print('Reject the null hypothesis: The variances are not equal.')
else:
    print('Fail to reject the null hypothesis: The variances are equal.')


In [None]:
combined_df['Rail passenger transport per Capita']

In [None]:
import pandas as pd
from scipy.stats import f_oneway

# Requires the 'Rail passenger transport per Capita' column to already exist
# in combined_df.

# Per-country samples. The merged table is an outer join, so rows with no
# per-capita figure hold NaN; they are removed so they cannot propagate into
# the test statistic.
per_capita_column = 'Rail passenger transport per Capita'
luxembourg_data = combined_df.loc[combined_df['Country'] == 'Luxembourg', per_capita_column].dropna()
ireland_data = combined_df.loc[combined_df['Country'] == 'Ireland', per_capita_column].dropna()
finland_data = combined_df.loc[combined_df['Country'] == 'Finland', per_capita_column].dropna()

# One-way ANOVA across the three countries.
f_statistic, p_value = f_oneway(luxembourg_data, ireland_data, finland_data)

# Report the result.
print(f'F-statistic: {f_statistic:.4f}')
print(f'p-value: {p_value:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value < alpha:
    print('Reject the null hypothesis: There is a significant difference among the groups.')
else:
    print('Fail to reject the null hypothesis: There is no significant difference among the groups.')


In [None]:
import pandas as pd
from scipy.stats import ttest_ind

# Requires the 'Rail passenger transport per Capita' column to already exist
# in combined_df.

# Per-country samples. The merged table is an outer join, so rows with no
# per-capita figure hold NaN; they are removed so they cannot propagate into
# the test statistics.
per_capita_column = 'Rail passenger transport per Capita'
luxembourg_data = combined_df.loc[combined_df['Country'] == 'Luxembourg', per_capita_column].dropna()
ireland_data = combined_df.loc[combined_df['Country'] == 'Ireland', per_capita_column].dropna()
finland_data = combined_df.loc[combined_df['Country'] == 'Finland', per_capita_column].dropna()

# Pairwise independent-samples t-tests.
t_statistic1, p_value1 = ttest_ind(luxembourg_data, ireland_data)
t_statistic2, p_value2 = ttest_ind(luxembourg_data, finland_data)
t_statistic3, p_value3 = ttest_ind(ireland_data, finland_data)

# Report the results.
print(f'Luxembourg vs. Ireland - t-statistic: {t_statistic1:.4f}, p-value: {p_value1:.4f}')
print(f'Luxembourg vs. Finland - t-statistic: {t_statistic2:.4f}, p-value: {p_value2:.4f}')
print(f'Ireland vs. Finland - t-statistic: {t_statistic3:.4f}, p-value: {p_value3:.4f}')

# Interpret each comparison at the chosen significance level.
alpha = 0.05
if p_value1 < alpha:
    print('Luxembourg vs. Ireland: Reject the null hypothesis - There is a significant difference.')
else:
    print('Luxembourg vs. Ireland: Fail to reject the null hypothesis - There is no significant difference.')

if p_value2 < alpha:
    print('Luxembourg vs. Finland: Reject the null hypothesis - There is a significant difference.')
else:
    print('Luxembourg vs. Finland: Fail to reject the null hypothesis - There is no significant difference.')

if p_value3 < alpha:
    print('Ireland vs. Finland: Reject the null hypothesis - There is a significant difference.')
else:
    print('Ireland vs. Finland: Fail to reject the null hypothesis - There is no significant difference.')


In [None]:
# 'population(thousands)' counts thousands of people, so it is converted to a
# head count before dividing; dividing by it directly gives euros per thousand
# inhabitants, not the per-capita figure the column name states.
combined_df['Rail infrastructure investment per Capita (Euro)'] = combined_df['Rail infrastructure investment(Euro)'] / (combined_df['population(thousands)'] * 1000)


In [None]:
from scipy.stats import kruskal

# Per-country samples. The merged table is an outer join, so rows with no
# per-capita figure hold NaN; they are removed so they cannot propagate into
# the test statistic.
investment_column = 'Rail infrastructure investment per Capita (Euro)'
luxembourg_data = combined_df.loc[combined_df['Country'] == 'Luxembourg', investment_column].dropna()
japan_data = combined_df.loc[combined_df['Country'] == 'Japan', investment_column].dropna()
ireland_data = combined_df.loc[combined_df['Country'] == 'Ireland', investment_column].dropna()
finland_data = combined_df.loc[combined_df['Country'] == 'Finland', investment_column].dropna()

# Kruskal-Wallis test across the four countries.
statistic, p_value = kruskal(luxembourg_data, japan_data, ireland_data, finland_data)

# Report the result.
print(f'Kruskal-Wallis Test Statistic: {statistic:.4f}')
print(f'p-value: {p_value:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value < alpha:
    print('Reject the null hypothesis: There is a significant difference among the groups.')
else:
    print('Fail to reject the null hypothesis: There is no significant difference among the groups.')


In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# The two countries to compare.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Per-capita investment for each country. The merged table is an outer join,
# so rows with no figure hold NaN; they are removed so they cannot propagate
# into the test statistic or distort the box plot.
investment_column = 'Rail infrastructure investment per Capita (Euro)'
data_country1 = combined_df.loc[combined_df['Country'] == country1, investment_column].dropna()
data_country2 = combined_df.loc[combined_df['Country'] == country2, investment_column].dropna()

# Two-sided Mann-Whitney U test.
statistic1, p_value1 = stats.mannwhitneyu(data_country1, data_country2, alternative='two-sided')

# Box plot of the two samples.
plt.figure(figsize=(10, 5))
plt.boxplot([data_country1, data_country2], labels=[country1, country2])
plt.title(f'Mann-Whitney U Test: {country1} vs. {country2}')
plt.ylabel('Rail infrastructure investment per Capita (Euro)')
plt.grid(True)

# Report the test result.
print(f'Mann-Whitney U Test ({country1} vs. {country2}):')
print(f'Statistic: {statistic1:.4f}')
print(f'p-value: {p_value1:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value1 < alpha:
    print(f'Reject the null hypothesis: The medians of {country1} and {country2} are significantly different.')
else:
    print(f'Fail to reject the null hypothesis: There is no significant difference in the medians of {country1} and {country2}.')

# Show the figure.
plt.show()


In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# The two countries to compare.
country1 = 'Ireland'
country2 = 'Japan'

# Per-capita investment for each country. The merged table is an outer join,
# so rows with no figure hold NaN; they are removed so they cannot propagate
# into the test statistic or distort the box plot.
investment_column = 'Rail infrastructure investment per Capita (Euro)'
data_country1 = combined_df.loc[combined_df['Country'] == country1, investment_column].dropna()
data_country2 = combined_df.loc[combined_df['Country'] == country2, investment_column].dropna()

# Two-sided Mann-Whitney U test.
statistic1, p_value1 = stats.mannwhitneyu(data_country1, data_country2, alternative='two-sided')

# Box plot of the two samples.
plt.figure(figsize=(10, 5))
plt.boxplot([data_country1, data_country2], labels=[country1, country2])
plt.title(f'Mann-Whitney U Test: {country1} vs. {country2}')
plt.ylabel('Rail infrastructure investment per Capita (Euro)')
plt.grid(True)

# Report the test result.
print(f'Mann-Whitney U Test ({country1} vs. {country2}):')
print(f'Statistic: {statistic1:.4f}')
print(f'p-value: {p_value1:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value1 < alpha:
    print(f'Reject the null hypothesis: The medians of {country1} and {country2} are significantly different.')
else:
    print(f'Fail to reject the null hypothesis: There is no significant difference in the medians of {country1} and {country2}.')

# Show the figure.
plt.show()

In [None]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

# The two countries to compare.
country1 = 'Ireland'
country2 = 'Finland'

# Per-capita investment for each country. The merged table is an outer join,
# so rows with no figure hold NaN; they are removed so they cannot propagate
# into the test statistic or distort the box plot.
investment_column = 'Rail infrastructure investment per Capita (Euro)'
data_country1 = combined_df.loc[combined_df['Country'] == country1, investment_column].dropna()
data_country2 = combined_df.loc[combined_df['Country'] == country2, investment_column].dropna()

# Two-sided Mann-Whitney U test.
statistic1, p_value1 = stats.mannwhitneyu(data_country1, data_country2, alternative='two-sided')

# Box plot of the two samples.
plt.figure(figsize=(10, 5))
plt.boxplot([data_country1, data_country2], labels=[country1, country2])
plt.title(f'Mann-Whitney U Test: {country1} vs. {country2}')
plt.ylabel('Rail infrastructure investment per Capita (Euro)')
plt.grid(True)

# Report the test result.
print(f'Mann-Whitney U Test ({country1} vs. {country2}):')
print(f'Statistic: {statistic1:.4f}')
print(f'p-value: {p_value1:.4f}')

# Interpret the result at the chosen significance level.
alpha = 0.05
if p_value1 < alpha:
    print(f'Reject the null hypothesis: The medians of {country1} and {country2} are significantly different.')
else:
    print(f'Fail to reject the null hypothesis: There is no significant difference in the medians of {country1} and {country2}.')

# Show the figure.
plt.show()

In [None]:
combined_df.drop(columns = ["Rail passenger transport per Capita","Rail infrastructure investment per Capita (Euro)"],inplace = True)

In [None]:
combined_df

In [None]:
# Ireland's rows with the country label removed, leaving the columns to correlate.
ireland_data = combined_df[combined_df['Country'] == 'Ireland'].drop(columns=['Country'])

# Pairwise correlations between the remaining columns.
correlation_matrix_ireland = ireland_data.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix_ireland,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Heatmap - Ireland')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rows for each country. The merged table is an outer join, so years that are
# missing from any source table hold NaN; only complete rows are kept because
# the estimator fitted on these features does not accept missing values.
ireland_data = combined_df[combined_df['Country'] == 'Ireland'].dropna()
luxembourg_data = combined_df[combined_df['Country'] == 'Luxembourg'].dropna()

# Exclude 'Country' and the target variable from the features
excluded_columns = ['Country', 'Total rail freight transport(M)']

# Features and target for Ireland
ireland_features = [col for col in ireland_data.columns if col not in excluded_columns]
ireland_X = ireland_data[ireland_features]
ireland_y = ireland_data['Total rail freight transport(M)']

# Features and target for Luxembourg
luxembourg_features = [col for col in luxembourg_data.columns if col not in excluded_columns]
luxembourg_X = luxembourg_data[luxembourg_features]
luxembourg_y = luxembourg_data['Total rail freight transport(M)']

# Train/test split for Ireland
ireland_X_train, ireland_X_test, ireland_y_train, ireland_y_test = train_test_split(
    ireland_X, ireland_y, test_size=0.2, random_state=42)

# Train/test split for Luxembourg
luxembourg_X_train, luxembourg_X_test, luxembourg_y_train, luxembourg_y_test = train_test_split(
    luxembourg_X, luxembourg_y, test_size=0.2, random_state=42)

# One scaler per country, each fitted on that country's training features
# only, so both fitted scalers remain available afterwards.
ireland_scaler = MinMaxScaler()
luxembourg_scaler = MinMaxScaler()

# Scale the features for Ireland
ireland_X_train_scaled = ireland_scaler.fit_transform(ireland_X_train)
ireland_X_test_scaled = ireland_scaler.transform(ireland_X_test)

# Scale the features for Luxembourg
luxembourg_X_train_scaled = luxembourg_scaler.fit_transform(luxembourg_X_train)
luxembourg_X_test_scaled = luxembourg_scaler.transform(luxembourg_X_test)

# `scaler` previously ended this cell fitted on the Luxembourg training
# features; the name is kept bound to that scaler for any code that uses it.
scaler = luxembourg_scaler


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline. The same estimator object is re-fitted for each country.
linear_reg = LinearRegression()

# 5-fold cross-validation on the scaled training splits. The scorer returns negated MSE,
# so the sign is flipped before taking the square root to obtain per-fold RMSE.
ireland_scores = cross_val_score(
    linear_reg, ireland_X_train_scaled, ireland_y_train,
    cv=5, scoring='neg_mean_squared_error')
luxembourg_scores = cross_val_score(
    linear_reg, luxembourg_X_train_scaled, luxembourg_y_train,
    cv=5, scoring='neg_mean_squared_error')

ireland_rmse_scores = np.sqrt(-ireland_scores)
luxembourg_rmse_scores = np.sqrt(-luxembourg_scores)

# Average the per-fold RMSE values into one figure per country.
ireland_avg_rmse = ireland_rmse_scores.mean()
luxembourg_avg_rmse = luxembourg_rmse_scores.mean()

print("Ireland RMSE:", ireland_avg_rmse)
print("Luxembourg RMSE:", luxembourg_avg_rmse)

# Fit on each country's training split and predict its held-out test split.
# Ireland's predictions are taken before the model is re-fitted for Luxembourg.
ireland_y_pred = linear_reg.fit(
    ireland_X_train_scaled, ireland_y_train).predict(ireland_X_test_scaled)
luxembourg_y_pred = linear_reg.fit(
    luxembourg_X_train_scaled, luxembourg_y_train).predict(luxembourg_X_test_scaled)

# Held-out test metrics for Ireland.
ireland_mse, ireland_mae, ireland_r2 = (
    mean_squared_error(ireland_y_test, ireland_y_pred),
    mean_absolute_error(ireland_y_test, ireland_y_pred),
    r2_score(ireland_y_test, ireland_y_pred),
)

print("Ireland MSE:", ireland_mse)
print("Ireland MAE:", ireland_mae)
print("Ireland R-squared:", ireland_r2)

# Held-out test metrics for Luxembourg.
luxembourg_mse, luxembourg_mae, luxembourg_r2 = (
    mean_squared_error(luxembourg_y_test, luxembourg_y_pred),
    mean_absolute_error(luxembourg_y_test, luxembourg_y_pred),
    r2_score(luxembourg_y_test, luxembourg_y_pred),
)

print("Luxembourg MSE:", luxembourg_mse)
print("Luxembourg MAE:", luxembourg_mae)
print("Luxembourg R-squared:", luxembourg_r2)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

# Lasso estimator used as the template for the grid search.
lasso_reg = Lasso()

# Candidate regularisation strengths, spanning four orders of magnitude.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}  # Adjust the range as needed

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
lasso_grid_search = GridSearchCV(
    estimator=lasso_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# --- Ireland: pick alpha, refit on the full training split, score on the test split ---
best_alpha_ireland = lasso_grid_search.fit(
    ireland_X_train_scaled, ireland_y_train).best_params_['alpha']

lasso_reg_ireland = Lasso(alpha=best_alpha_ireland)
lasso_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = lasso_reg_ireland.predict(ireland_X_test_scaled)

ireland_mse, ireland_mae, ireland_r2 = (
    mean_squared_error(ireland_y_test, ireland_y_pred),
    mean_absolute_error(ireland_y_test, ireland_y_pred),
    r2_score(ireland_y_test, ireland_y_pred),
)

print("Ireland MSE:", ireland_mse)
print("Ireland MAE:", ireland_mae)
print("Ireland R-squared:", ireland_r2)

# --- Luxembourg: the same search object is re-fitted on the Luxembourg training split ---
best_alpha_luxembourg = lasso_grid_search.fit(
    luxembourg_X_train_scaled, luxembourg_y_train).best_params_['alpha']

lasso_reg_luxembourg = Lasso(alpha=best_alpha_luxembourg)
lasso_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = lasso_reg_luxembourg.predict(luxembourg_X_test_scaled)

luxembourg_mse, luxembourg_mae, luxembourg_r2 = (
    mean_squared_error(luxembourg_y_test, luxembourg_y_pred),
    mean_absolute_error(luxembourg_y_test, luxembourg_y_pred),
    r2_score(luxembourg_y_test, luxembourg_y_pred),
)

print("Luxembourg MSE:", luxembourg_mse)
print("Luxembourg MAE:", luxembourg_mae)
print("Luxembourg R-squared:", luxembourg_r2)

# Report the regularisation strength chosen for each country.
print("Best Alpha (Ireland):", best_alpha_ireland)
print("Best Alpha (Luxembourg):", best_alpha_luxembourg)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Ridge estimator used as the template for the grid search.
ridge_reg = Ridge()

# Candidate regularisation strengths, spanning four orders of magnitude.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}  # Adjust alpha values as needed

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
ridge_grid_search = GridSearchCV(
    estimator=ridge_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# --- Ireland: pick alpha, refit on the full training split, score on the test split ---
best_alpha_ireland = ridge_grid_search.fit(
    ireland_X_train_scaled, ireland_y_train).best_params_['alpha']

ridge_reg_ireland = Ridge(alpha=best_alpha_ireland)
ridge_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = ridge_reg_ireland.predict(ireland_X_test_scaled)

ireland_mse, ireland_mae, ireland_r2 = (
    mean_squared_error(ireland_y_test, ireland_y_pred),
    mean_absolute_error(ireland_y_test, ireland_y_pred),
    r2_score(ireland_y_test, ireland_y_pred),
)

print("Ireland MSE (Ridge):", ireland_mse)
print("Ireland MAE (Ridge):", ireland_mae)
print("Ireland R-squared (Ridge):", ireland_r2)

# --- Luxembourg: the same search object is re-fitted on the Luxembourg training split ---
best_alpha_luxembourg = ridge_grid_search.fit(
    luxembourg_X_train_scaled, luxembourg_y_train).best_params_['alpha']

ridge_reg_luxembourg = Ridge(alpha=best_alpha_luxembourg)
ridge_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = ridge_reg_luxembourg.predict(luxembourg_X_test_scaled)

luxembourg_mse, luxembourg_mae, luxembourg_r2 = (
    mean_squared_error(luxembourg_y_test, luxembourg_y_pred),
    mean_absolute_error(luxembourg_y_test, luxembourg_y_pred),
    r2_score(luxembourg_y_test, luxembourg_y_pred),
)

print("Luxembourg MSE (Ridge):", luxembourg_mse)
print("Luxembourg MAE (Ridge):", luxembourg_mae)
print("Luxembourg R-squared (Ridge):", luxembourg_r2)

# Report the regularisation strength chosen for each country.
print("Best Alpha (Ireland):", best_alpha_ireland)
print("Best Alpha (Luxembourg):", best_alpha_luxembourg)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Template estimator for the grid search. The seed is pinned because a decision tree
# permutes features at every split, so equally good splits may otherwise be resolved
# differently from run to run (and depend on the notebook's global RNG state).
tree_reg = DecisionTreeRegressor(random_state=42)

# Hyperparameter grid explored for both countries.
param_grid = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
tree_grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# --- Ireland ---
tree_grid_search.fit(ireland_X_train_scaled, ireland_y_train)
best_params_ireland = tree_grid_search.best_params_

# Refit on the full training split with the winning settings and the same seed that was
# used during the search, so the reported model matches the one that was validated.
best_tree_reg_ireland = DecisionTreeRegressor(**best_params_ireland, random_state=42)
best_tree_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = best_tree_reg_ireland.predict(ireland_X_test_scaled)

# Held-out test metrics for Ireland.
ireland_mse = mean_squared_error(ireland_y_test, ireland_y_pred)
ireland_mae = mean_absolute_error(ireland_y_test, ireland_y_pred)
ireland_r2 = r2_score(ireland_y_test, ireland_y_pred)

print("Ireland MSE (Decision Tree):", ireland_mse)
print("Ireland MAE (Decision Tree):", ireland_mae)
print("Ireland R-squared (Decision Tree):", ireland_r2)

# --- Luxembourg (the same search object is re-fitted on the Luxembourg training split) ---
tree_grid_search.fit(luxembourg_X_train_scaled, luxembourg_y_train)
best_params_luxembourg = tree_grid_search.best_params_

best_tree_reg_luxembourg = DecisionTreeRegressor(**best_params_luxembourg, random_state=42)
best_tree_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = best_tree_reg_luxembourg.predict(luxembourg_X_test_scaled)

# Held-out test metrics for Luxembourg.
luxembourg_mse = mean_squared_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_mae = mean_absolute_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_r2 = r2_score(luxembourg_y_test, luxembourg_y_pred)

print("Luxembourg MSE (Decision Tree):", luxembourg_mse)
print("Luxembourg MAE (Decision Tree):", luxembourg_mae)
print("Luxembourg R-squared (Decision Tree):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Template estimator for the grid search. The seed is pinned so that bootstrap sampling
# and per-split feature sampling are repeatable; without it the selected hyperparameters
# and the reported metrics change between runs and with cell execution order.
rf_reg = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored for both countries.
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
rf_grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# --- Ireland ---
rf_grid_search.fit(ireland_X_train_scaled, ireland_y_train)
best_params_ireland = rf_grid_search.best_params_

# Refit on the full training split with the winning settings and the same seed that was
# used during the search, so the reported model matches the one that was validated.
best_rf_reg_ireland = RandomForestRegressor(**best_params_ireland, random_state=42)
best_rf_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = best_rf_reg_ireland.predict(ireland_X_test_scaled)

# Held-out test metrics for Ireland.
ireland_mse = mean_squared_error(ireland_y_test, ireland_y_pred)
ireland_mae = mean_absolute_error(ireland_y_test, ireland_y_pred)
ireland_r2 = r2_score(ireland_y_test, ireland_y_pred)

print("Ireland MSE (Random Forest):", ireland_mse)
print("Ireland MAE (Random Forest):", ireland_mae)
print("Ireland R-squared (Random Forest):", ireland_r2)

# --- Luxembourg (the same search object is re-fitted on the Luxembourg training split) ---
rf_grid_search.fit(luxembourg_X_train_scaled, luxembourg_y_train)
best_params_luxembourg = rf_grid_search.best_params_

best_rf_reg_luxembourg = RandomForestRegressor(**best_params_luxembourg, random_state=42)
best_rf_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = best_rf_reg_luxembourg.predict(luxembourg_X_test_scaled)

# Held-out test metrics for Luxembourg.
luxembourg_mse = mean_squared_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_mae = mean_absolute_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_r2 = r2_score(luxembourg_y_test, luxembourg_y_pred)

print("Luxembourg MSE (Random Forest):", luxembourg_mse)
print("Luxembourg MAE (Random Forest):", luxembourg_mae)
print("Luxembourg R-squared (Random Forest):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)


In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Support-vector regressor used as the template for the grid search.
svr_reg = SVR()

# Hyperparameter grid explored for both countries.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],  # Kernel type
    'C': [0.1, 1, 10],  # Regularization parameter C
    'epsilon': [0.01, 0.1, 0.2]  # Epsilon in the epsilon-SVR model
}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
svr_grid_search = GridSearchCV(
    estimator=svr_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# --- Ireland: search, refit with the winning settings, score on the test split ---
best_params_ireland = svr_grid_search.fit(
    ireland_X_train_scaled, ireland_y_train).best_params_

# best_params_ireland holds exactly the three searched keys: kernel, C and epsilon.
best_svr_reg_ireland = SVR(**best_params_ireland)
best_svr_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = best_svr_reg_ireland.predict(ireland_X_test_scaled)

ireland_mse, ireland_mae, ireland_r2 = (
    mean_squared_error(ireland_y_test, ireland_y_pred),
    mean_absolute_error(ireland_y_test, ireland_y_pred),
    r2_score(ireland_y_test, ireland_y_pred),
)

print("Ireland MSE (SVR):", ireland_mse)
print("Ireland MAE (SVR):", ireland_mae)
print("Ireland R-squared (SVR):", ireland_r2)

# --- Luxembourg: the same search object is re-fitted on the Luxembourg training split ---
best_params_luxembourg = svr_grid_search.fit(
    luxembourg_X_train_scaled, luxembourg_y_train).best_params_

best_svr_reg_luxembourg = SVR(**best_params_luxembourg)
best_svr_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = best_svr_reg_luxembourg.predict(luxembourg_X_test_scaled)

luxembourg_mse, luxembourg_mae, luxembourg_r2 = (
    mean_squared_error(luxembourg_y_test, luxembourg_y_pred),
    mean_absolute_error(luxembourg_y_test, luxembourg_y_pred),
    r2_score(luxembourg_y_test, luxembourg_y_pred),
)

print("Luxembourg MSE (SVR):", luxembourg_mse)
print("Luxembourg MAE (SVR):", luxembourg_mae)
print("Luxembourg R-squared (SVR):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# k-nearest-neighbours regressor used as the template for the grid search.
knn_reg = KNeighborsRegressor()

# Hyperparameter grid explored for both countries.
param_grid = {
    'n_neighbors': [3, 5, 7],  # Number of neighbors to consider
    'weights': ['uniform', 'distance'],  # Weighting method
    'p': [1, 2]  # Power parameter for distance metric
}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
knn_grid_search = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# --- Ireland: search, refit with the winning settings, score on the test split ---
best_params_ireland = knn_grid_search.fit(
    ireland_X_train_scaled, ireland_y_train).best_params_

# best_params_ireland holds exactly the three searched keys: n_neighbors, weights and p.
best_knn_reg_ireland = KNeighborsRegressor(**best_params_ireland)
best_knn_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
ireland_y_pred = best_knn_reg_ireland.predict(ireland_X_test_scaled)

ireland_mse, ireland_mae, ireland_r2 = (
    mean_squared_error(ireland_y_test, ireland_y_pred),
    mean_absolute_error(ireland_y_test, ireland_y_pred),
    r2_score(ireland_y_test, ireland_y_pred),
)

print("Ireland MSE (KNN):", ireland_mse)
print("Ireland MAE (KNN):", ireland_mae)
print("Ireland R-squared (KNN):", ireland_r2)

# --- Luxembourg: the same search object is re-fitted on the Luxembourg training split ---
best_params_luxembourg = knn_grid_search.fit(
    luxembourg_X_train_scaled, luxembourg_y_train).best_params_

best_knn_reg_luxembourg = KNeighborsRegressor(**best_params_luxembourg)
best_knn_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
luxembourg_y_pred = best_knn_reg_luxembourg.predict(luxembourg_X_test_scaled)

luxembourg_mse, luxembourg_mae, luxembourg_r2 = (
    mean_squared_error(luxembourg_y_test, luxembourg_y_pred),
    mean_absolute_error(luxembourg_y_test, luxembourg_y_pred),
    r2_score(luxembourg_y_test, luxembourg_y_pred),
)

print("Luxembourg MSE (KNN):", luxembourg_mse)
print("Luxembourg MAE (KNN):", luxembourg_mae)
print("Luxembourg R-squared (KNN):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)



In [None]:
# Fit a fixed-seed, 100-tree random forest on the scaled Ireland training split. It is
# used here only as a source of feature-importance scores.
rf_reg_ireland = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
feature_importance_ireland = rf_reg_ireland.feature_importances_

# Pair each feature name with its importance score (same column order as ireland_X).
feature_importance_dict_ireland = {
    name: score for name, score in zip(ireland_X.columns, feature_importance_ireland)
}

# Rank the (name, score) pairs from most to least important.
sorted_features_ireland = sorted(
    feature_importance_dict_ireland.items(), key=lambda pair: pair[1], reverse=True)

# Keep the names of the 4 highest-ranked features.
top_n_ireland = 4
selected_features_ireland = [name for name, _ in sorted_features_ireland[:top_n_ireland]]

# Translate the selected names into column positions once, then slice the scaled
# train/test arrays down to those columns (in ranked order).
selected_idx_ireland = [ireland_X.columns.get_loc(name) for name in selected_features_ireland]
ireland_X_train_scaled_selected = ireland_X_train_scaled[:, selected_idx_ireland]
ireland_X_test_scaled_selected = ireland_X_test_scaled[:, selected_idx_ireland]

In [None]:
# Fit a fixed-seed, 100-tree random forest on the scaled Luxembourg training split. It is
# used here only as a source of feature-importance scores.
rf_reg_luxembourg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
feature_importance_luxembourg = rf_reg_luxembourg.feature_importances_

# Pair each feature name with its importance score (same column order as luxembourg_X).
feature_importance_dict_luxembourg = {
    name: score for name, score in zip(luxembourg_X.columns, feature_importance_luxembourg)
}

# Rank the (name, score) pairs from most to least important.
sorted_features_luxembourg = sorted(
    feature_importance_dict_luxembourg.items(), key=lambda pair: pair[1], reverse=True)

# Keep the names of the 4 highest-ranked features.
top_n_luxembourg = 4
selected_features_luxembourg = [name for name, _ in sorted_features_luxembourg[:top_n_luxembourg]]

# Translate the selected names into column positions once, then slice the scaled
# train/test arrays down to those columns (in ranked order).
selected_idx_luxembourg = [luxembourg_X.columns.get_loc(name) for name in selected_features_luxembourg]
luxembourg_X_train_scaled_selected = luxembourg_X_train_scaled[:, selected_idx_luxembourg]
luxembourg_X_test_scaled_selected = luxembourg_X_test_scaled[:, selected_idx_luxembourg]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Template estimator for the grid search. It carries the same seed as the final models
# below; previously the search ran unseeded, so the chosen hyperparameters could change
# between runs and were validated under different random draws than the reported model.
rf_reg = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored for both countries (shallow trees only).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [1,2,3,4],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
rf_grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# --- Ireland (reduced feature set) ---
rf_grid_search.fit(ireland_X_train_scaled_selected, ireland_y_train)
best_params_ireland = rf_grid_search.best_params_

# Refit on the full training split with the winning settings and the search's seed.
best_rf_reg_ireland = RandomForestRegressor(**best_params_ireland, random_state=42)
best_rf_reg_ireland.fit(ireland_X_train_scaled_selected, ireland_y_train)
ireland_y_pred = best_rf_reg_ireland.predict(ireland_X_test_scaled_selected)

# Held-out test metrics for Ireland.
ireland_mse = mean_squared_error(ireland_y_test, ireland_y_pred)
ireland_mae = mean_absolute_error(ireland_y_test, ireland_y_pred)
ireland_r2 = r2_score(ireland_y_test, ireland_y_pred)

print("Ireland MSE (Random Forest):", ireland_mse)
print("Ireland MAE (Random Forest):", ireland_mae)
print("Ireland R-squared (Random Forest):", ireland_r2)

# --- Luxembourg (reduced feature set; the same search object is re-fitted) ---
rf_grid_search.fit(luxembourg_X_train_scaled_selected, luxembourg_y_train)
best_params_luxembourg = rf_grid_search.best_params_

best_rf_reg_luxembourg = RandomForestRegressor(**best_params_luxembourg, random_state=42)
best_rf_reg_luxembourg.fit(luxembourg_X_train_scaled_selected, luxembourg_y_train)
luxembourg_y_pred = best_rf_reg_luxembourg.predict(luxembourg_X_test_scaled_selected)

# Held-out test metrics for Luxembourg.
luxembourg_mse = mean_squared_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_mae = mean_absolute_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_r2 = r2_score(luxembourg_y_test, luxembourg_y_pred)

print("Luxembourg MSE (Random Forest):", luxembourg_mse)
print("Luxembourg MAE (Random Forest):", luxembourg_mae)
print("Luxembourg R-squared (Random Forest):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)


In [None]:
# Fit a fixed-seed, 100-tree random forest on the scaled Ireland training split. It is
# used here only as a source of feature-importance scores.
rf_reg_ireland = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_ireland.fit(ireland_X_train_scaled, ireland_y_train)
feature_importance_ireland = rf_reg_ireland.feature_importances_

# Pair each feature name with its importance score (same column order as ireland_X).
feature_importance_dict_ireland = {
    name: score for name, score in zip(ireland_X.columns, feature_importance_ireland)
}

# Rank the (name, score) pairs from most to least important.
sorted_features_ireland = sorted(
    feature_importance_dict_ireland.items(), key=lambda pair: pair[1], reverse=True)

# Keep the names of the 3 highest-ranked features.
top_n_ireland = 3
selected_features_ireland = [name for name, _ in sorted_features_ireland[:top_n_ireland]]

# Translate the selected names into column positions once, then slice the scaled
# train/test arrays down to those columns (in ranked order).
selected_idx_ireland = [ireland_X.columns.get_loc(name) for name in selected_features_ireland]
ireland_X_train_scaled_selected = ireland_X_train_scaled[:, selected_idx_ireland]
ireland_X_test_scaled_selected = ireland_X_test_scaled[:, selected_idx_ireland]

In [None]:
# Fit a fixed-seed, 100-tree random forest on the scaled Luxembourg training split. It is
# used here only as a source of feature-importance scores.
rf_reg_luxembourg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_luxembourg.fit(luxembourg_X_train_scaled, luxembourg_y_train)
feature_importance_luxembourg = rf_reg_luxembourg.feature_importances_

# Pair each feature name with its importance score (same column order as luxembourg_X).
feature_importance_dict_luxembourg = {
    name: score for name, score in zip(luxembourg_X.columns, feature_importance_luxembourg)
}

# Rank the (name, score) pairs from most to least important.
sorted_features_luxembourg = sorted(
    feature_importance_dict_luxembourg.items(), key=lambda pair: pair[1], reverse=True)

# Keep the names of the 3 highest-ranked features.
top_n_luxembourg = 3
selected_features_luxembourg = [name for name, _ in sorted_features_luxembourg[:top_n_luxembourg]]

# Translate the selected names into column positions once, then slice the scaled
# train/test arrays down to those columns (in ranked order).
selected_idx_luxembourg = [luxembourg_X.columns.get_loc(name) for name in selected_features_luxembourg]
luxembourg_X_train_scaled_selected = luxembourg_X_train_scaled[:, selected_idx_luxembourg]
luxembourg_X_test_scaled_selected = luxembourg_X_test_scaled[:, selected_idx_luxembourg]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Template estimator for the grid search. It carries the same seed as the final models
# below; previously the search ran unseeded, so the chosen hyperparameters could change
# between runs and were validated under different random draws than the reported model.
rf_reg = RandomForestRegressor(random_state=42)

# Hyperparameter grid explored for both countries (shallow trees only).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [1,2,3,4],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# 5-fold cross-validated search; candidates are ranked by negated mean squared error.
rf_grid_search = GridSearchCV(rf_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

# --- Ireland (reduced feature set) ---
rf_grid_search.fit(ireland_X_train_scaled_selected, ireland_y_train)
best_params_ireland = rf_grid_search.best_params_

# Refit on the full training split with the winning settings and the search's seed.
best_rf_reg_ireland = RandomForestRegressor(**best_params_ireland, random_state=42)
best_rf_reg_ireland.fit(ireland_X_train_scaled_selected, ireland_y_train)
ireland_y_pred = best_rf_reg_ireland.predict(ireland_X_test_scaled_selected)

# Held-out test metrics for Ireland.
ireland_mse = mean_squared_error(ireland_y_test, ireland_y_pred)
ireland_mae = mean_absolute_error(ireland_y_test, ireland_y_pred)
ireland_r2 = r2_score(ireland_y_test, ireland_y_pred)

print("Ireland MSE (Random Forest):", ireland_mse)
print("Ireland MAE (Random Forest):", ireland_mae)
print("Ireland R-squared (Random Forest):", ireland_r2)

# --- Luxembourg (reduced feature set; the same search object is re-fitted) ---
rf_grid_search.fit(luxembourg_X_train_scaled_selected, luxembourg_y_train)
best_params_luxembourg = rf_grid_search.best_params_

best_rf_reg_luxembourg = RandomForestRegressor(**best_params_luxembourg, random_state=42)
best_rf_reg_luxembourg.fit(luxembourg_X_train_scaled_selected, luxembourg_y_train)
luxembourg_y_pred = best_rf_reg_luxembourg.predict(luxembourg_X_test_scaled_selected)

# Held-out test metrics for Luxembourg.
luxembourg_mse = mean_squared_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_mae = mean_absolute_error(luxembourg_y_test, luxembourg_y_pred)
luxembourg_r2 = r2_score(luxembourg_y_test, luxembourg_y_pred)

print("Luxembourg MSE (Random Forest):", luxembourg_mse)
print("Luxembourg MAE (Random Forest):", luxembourg_mae)
print("Luxembourg R-squared (Random Forest):", luxembourg_r2)

# Report the hyperparameters chosen for each country.
print("Best Hyperparameters (Ireland):", best_params_ireland)
print("Best Hyperparameters (Luxembourg):", best_params_luxembourg)


In [1]:
import praw
import pandas as pd
from dotenv import load_dotenv
from os import getenv
from datetime import datetime as dt

# Pull APP_ID / APP_SECRET / APP_NAME / REDDIT_USERNAME from a .env file into the environment
load_dotenv()

# Reddit API client; credentials are read from the environment rather than hard-coded
reddit = praw.Reddit(
    client_id=getenv("APP_ID"),
    client_secret=getenv("APP_SECRET"),
    user_agent=f"{getenv('APP_NAME')} u/{getenv('REDDIT_USERNAME')}",
)

# Search phrase
keyword = "public transport in Ireland"

# Subreddit to search in
subreddit_name = "irishtourism"

# Run the search; limit=None requests every available result
subreddit = reddit.subreddit(subreddit_name)
search_results = subreddit.search(keyword, limit=None)

# Print the title and link of each hit, followed by a separator line
for submission in search_results:
    print(f"标题: {submission.title}", f"链接: {submission.url}", "-" * 50, sep="\n")
    


标题: Experiences using Public transport in Ireland
链接: https://www.reddit.com/r/irishtourism/comments/17y3yl5/experiences_using_public_transport_in_ireland/
--------------------------------------------------
标题: How good is public transport in Ireland?
链接: https://www.reddit.com/r/irishtourism/comments/v05dsw/how_good_is_public_transport_in_ireland/
--------------------------------------------------
标题: Going to Ireland on foot with a friend of mine to travel and do some WWOOfing in Burren. Is bus the main public transport in Ireland or do you have trains as well ? Any advice before we discover the country ? We will mainly visit the East coast (Killarney, Dingle, Aran Isles).
链接: https://www.reddit.com/r/irishtourism/comments/15oc1bl/going_to_ireland_on_foot_with_a_friend_of_mine_to/
--------------------------------------------------
标题: Is there a way to get a list of all possible public transport from a town in Ireland?
链接: https://www.reddit.com/r/irishtourism/comments/176dt1i/is_the

标题: Need itinerary advice - beginning planning stages of a 8 to 9 day trip in early October
链接: https://www.reddit.com/r/irishtourism/comments/10nyk2a/need_itinerary_advice_beginning_planning_stages/
--------------------------------------------------
标题: Bikes for Hire with toddler seats in Dublin and/or Derry? Also best playgrounds and parks?
链接: https://www.reddit.com/r/irishtourism/comments/173c3r8/bikes_for_hire_with_toddler_seats_in_dublin_andor/
--------------------------------------------------
标题: Tips about Bus Eireann 275 (Tralee - Dingle) and 350 (Ennis - Doolin)
链接: https://www.reddit.com/r/irishtourism/comments/voxpo0/tips_about_bus_eireann_275_tralee_dingle_and_350/
--------------------------------------------------
标题: 12 nights in October
链接: https://www.reddit.com/r/irishtourism/comments/xyhxsi/12_nights_in_october/
--------------------------------------------------
标题: Recommendations for trip to Ireland
链接: https://www.reddit.com/r/irishtourism/comments/14wvkbl/recom

标题: Travelling to Dublin in August. Would like to know a couple of things before flying there. This was an impulsive decision so not very prepared for the trip.
链接: https://www.reddit.com/r/irishtourism/comments/opj317/travelling_to_dublin_in_august_would_like_to_know/
--------------------------------------------------
标题: What to do with 2 days before an October wedding in Slane
链接: https://www.reddit.com/r/irishtourism/comments/w5u1m6/what_to_do_with_2_days_before_an_october_wedding/
--------------------------------------------------
标题: Current COVID vac requirements
链接: https://www.reddit.com/r/irishtourism/comments/xpmpxu/current_covid_vac_requirements/
--------------------------------------------------
标题: Visiting in March 2023
链接: https://www.reddit.com/r/irishtourism/comments/10qeu98/visiting_in_march_2023/
--------------------------------------------------
标题: Shannon itinerary advice!
链接: https://www.reddit.com/r/irishtourism/comments/w3mbke/shannon_itinerary_advice/
-------

In [2]:
# Identifier of the post to analyse, copied by hand from the search results above
# (it is the short token after /comments/ in the post URL).
submission_id = "17y3yl5"

# Look the post up through the `reddit` client created in the search cell
submission = reddit.submission(id=submission_id)


# limit=None asks PRAW to resolve every "load more comments" placeholder,
# so the comment tree is fully populated before it is flattened.
submission.comments.replace_more(limit=None)  # fetch all comments
# Flatten the comment tree into a single list; later cells iterate over `comments`.
comments = submission.comments.list()


In [3]:
#pip install pymongo

In [4]:
import praw
import pymongo
from dotenv import load_dotenv
from os import getenv
import json

load_dotenv()

# Connection settings come from the environment / .env file so that no
# credentials are stored in the notebook itself.
MONGODB_URI = getenv("MONGODB_URI")
base_name = getenv("BASENAME")
collection_name = getenv("collection_name")

# Connect to the MongoDB database
client = pymongo.MongoClient(MONGODB_URI)
try:
    db = client[base_name]
    collection = db[collection_name]

    # Iterate through comments and store them in MongoDB.
    # NOTE: every run appends new documents; re-running this cell stores the
    # same comments again because no unique key is set.
    for comment in comments:
        # Create a dictionary to store comment data
        comment_data = {
            # Deleted accounts have no author object, so fall back to a placeholder
            "author": comment.author.name if comment.author else "Unknown",
            "body": comment.body,
            "created_utc": comment.created_utc,
            # Add other necessary fields as needed
        }

        # Echo the document as JSON before inserting: insert_one adds an
        # "_id" key to the dict it receives, which is not JSON-serialisable.
        comment_json = json.dumps(comment_data)
        print(comment_json)

        # The dict is inserted directly; the former dumps/loads round-trip
        # produced an identical document and was redundant.
        collection.insert_one(comment_data)
finally:
    # Close the MongoDB connection even if building or inserting a document fails
    client.close()

{"author": "trippiler", "body": "Bus eireann extremely unreliable Allow a lot of extra time if you need to be on time and avoid for intercounty travel (very slow). Private buses to and from Dublin oversubscribed (Aircoach, Gobus, etc.) and need to be booked in advance. Expressway really overpriced. \n\nLuas, Dublin Bus, Dart and Irishrail are decent. Irishrail and Dart are a bit outdated", "created_utc": 1700307341.0}
{"author": "maevewiley554", "body": "Yeah even in towns like Athlone that got the new electric buses and there\u2019s a town bus that supposed to be going every half hour and yet people are just left standing in the rain not knowing when it will come. It would be nice if they could at least announce bus route cancellations or delays so people could have the chance to make alternative plans rather than waiting for a bus that\u2019s not due to show up for another hour or two.", "created_utc": 1700317589.0}
{"author": "Roseha-aka-rosephoto", "body": "Last year I was supposed

In [5]:
from datetime import datetime as dt
# Text of every comment, in the order the comments were fetched
comments_list = [comment.body for comment in comments]

# created_utc is epoch seconds in UTC. pd.Timestamp(..., unit="s") yields a
# timezone-naive UTC timestamp, so the result no longer depends on the local
# timezone of the machine running the notebook (dt.fromtimestamp without a tz
# argument converts to local time).
time_list = [pd.Timestamp(comment.created_utc, unit="s") for comment in comments]

# One row per comment: its text and its posting time (UTC)
df = pd.DataFrame({'Comment': comments_list, 'Post Time': time_list})

# Show the resulting table
print(df)

                                              Comment           Post Time
0   Bus eireann extremely unreliable Allow a lot o... 2023-11-18 11:35:41
1   Yeah even in towns like Athlone that got the n... 2023-11-18 14:26:29
2   Last year I was supposed to get a car service ... 2023-11-18 19:33:15
3   Irish Rail: Good but too overcrowded during pe... 2023-11-18 11:05:28
4   > Can we get a more balanced and objective vie... 2023-11-18 14:49:37
..                                                ...                 ...
57  Will see what they come back with, it can be e... 2023-11-19 12:16:24
58  Grand so, no rush on it like, take your time, ... 2023-11-19 11:04:26
59  The trains run a bit later Monday- Saturday in... 2023-11-18 20:33:57
60  I don’t have personal experience of that route... 2023-11-19 13:20:08
61  Grand will do- I just need to figure out how t... 2023-11-19 11:08:06

[62 rows x 2 columns]


In [7]:
# Persist the comment table as CSV; with default arguments to_csv also writes
# the row index as an unnamed first column.
# NOTE(review): this is a hard-coded absolute Windows path inside one user's
# profile, so the cell fails on any other machine — consider a relative or
# configurable output directory.
df.to_csv("C:\\Users\\luozh\\CA2\\comments.csv")

In [None]:
import praw
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import nltk
# Download NLTK's stopwords and word tokenization data.
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' resource
# instead of the pickled 'punkt' models, so both are fetched to keep the
# tokenizer working on old and new installations alike.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize the stemmer and stop words list
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))  # set for O(1) membership tests


# Preprocess comments
def preprocess_text(text):
    """Normalise one comment into a space-separated string of word stems.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is tokenised with NLTK, stop words
    are dropped and the surviving tokens are Porter-stemmed.

    Because punctuation is deleted before tokenising, apostrophes vanish too
    (e.g. "don't" becomes "dont").

    Relies on the module-level `stemmer` and `stop_words` objects.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept_stems = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return " ".join(kept_stems)

# Add a cleaned, stemmed version of every comment as a new column
df['Preprocessed Comment'] = df['Comment'].map(preprocess_text)

# Show the table including the new column
print(df)

In [None]:
#pip install textblob

In [None]:
from textblob import TextBlob

In [None]:
# Sentiment analysis of the comments (TextBlob)
# NOTE(review): a later cell defines another function with this same name
# (VADER-based), which replaces this one once that cell has run.
def analyze_sentiment(comment):
    """Label a comment by the sign of its TextBlob polarity score.

    Returns "Positive" for polarity above zero, "Negative" for polarity below
    zero and "Neutral" otherwise.
    """
    polarity = TextBlob(comment).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

# Label every raw comment; note the VADER cell further down writes numeric
# scores into this same 'Sentiment' column.
df['Sentiment'] = df['Comment'].map(analyze_sentiment)

# Show the DataFrame with the sentiment analysis result
print(df)

In [None]:
#pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Create the VADER sentiment analyzer (shared by every call below)
analyzer = SentimentIntensityAnalyzer()

# Score a comment so its sentiment polarity can be added to the DataFrame
def analyze_sentiment(comment):
    """Return the VADER 'compound' score of a comment.

    The compound value is the analyzer's overall sentiment polarity for the
    text. Uses the module-level `analyzer` instance.
    """
    return analyzer.polarity_scores(comment)['compound']

# Store the compound score of every raw comment in the 'Sentiment' column
df['Sentiment'] = df['Comment'].map(analyze_sentiment)

# Show the DataFrame with the sentiment analysis result
print(df)

In [None]:
def classify_sentiment(compound_score, pos_threshold=0.05, neg_threshold=-0.05):
    """Turn a numeric sentiment score into a three-way label.

    Args:
        compound_score: Numeric sentiment score to classify.
        pos_threshold: Scores at or above this value are "Positive".
        neg_threshold: Scores at or below this value are "Negative".

    Returns:
        "Positive", "Negative", or "Neutral" for anything strictly between
        the two thresholds. The positive check is made first.
    """
    if compound_score >= pos_threshold:
        return "Positive"
    return "Negative" if compound_score <= neg_threshold else "Neutral"

# Convert each numeric score in 'Sentiment' to a label using the default thresholds
df['Sentiment_Label'] = df['Sentiment'].map(classify_sentiment)

# Show the DataFrame with the sentiment classification
print(df)
