In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Load the CSV file (the bare file name is resolved against the working directory).
df = pd.read_csv('global-co2-fossil-plus-land-use.csv')

# Basic structure of the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("\n数据集统计描述：")
print(df.describe())

# Preview the first rows.
print("\n数据预览：")
print(df.head())

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42604 entries, 0 to 42603
Data columns (total 6 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Entity                                          42604 non-null  object 
 1   Code                                            37615 non-null  object 
 2   Year                                            42604 non-null  int64  
 3   Annual CO₂ emissions                            27434 non-null  float64
 4   Annual CO₂ emissions including land-use change  24212 non-null  float64
 5   Annual CO₂ emissions from land-use change       39388 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 2.0+ MB
None

数据集统计描述：
               Year  Annual CO₂ emissions  \
count  42604.000000          2.743400e+04   
mean    1936.939137          4.332293e+08   
std       49.849409          1.918101e+09   
min     18

In [2]:
# The full year range is too wide, so keep only 2011-2021 (both ends included).
# The Code column is not needed for the analysis and is dropped in the same step.
df_filtered = df[df['Year'].between(2011, 2021)].drop(columns=['Code'])

# Report missing values per column, then the number of duplicated rows.
print("缺失值统计：")
print(df_filtered.isna().sum())
print("重复值的数量：",df_filtered.duplicated().sum())

# Missing values are handled by discarding every row that contains one.
df_filtered = df_filtered.dropna()


缺失值统计：
Entity                                              0
Year                                                0
Annual CO₂ emissions                              132
Annual CO₂ emissions including land-use change    473
Annual CO₂ emissions from land-use change         341
dtype: int64
重复值的数量： 0


In [3]:
# Outlier handling: drop the rows that fall outside the IQR fences.
def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of
    ``df[col]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the numeric column to screen.
        factor: Fence width as a multiple of the IQR. Defaults to 1.5.

    Returns:
        A new, independent DataFrame holding the kept rows with their
        original index labels. Rows whose ``col`` value is NaN are dropped
        too, because NaN never compares as being inside the fences.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    inside = values.between(q1 - factor * iqr, q3 + factor * iqr)
    # Return a copy so that columns can be added to the result later without
    # pandas raising SettingWithCopyWarning.
    return df[inside].copy()

# Emission columns screened for outliers, in this order; each pass works on
# the rows that survived the previous one.
related_cols = [
    'Annual CO₂ emissions',
    'Annual CO₂ emissions including land-use change',
    'Annual CO₂ emissions from land-use change',
]
for col in related_cols:
    # Columns that are absent from the frame are skipped.
    if col not in df_filtered.columns:
        continue
    df_filtered = remove_outliers(df_filtered, col)

In [10]:
# Standardization: Z-score scaling.
# NOTE: every statement in this cell is commented out, so the cell currently
# does nothing and no scaled columns are added to df_filtered.
#scaler = StandardScaler()
#df_filtered['Annual CO₂ emissions_标准化'] = scaler.fit_transform(df_filtered[['Annual CO₂ emissions']])
#df_filtered['Annual CO₂ emissions including land-use change_标准化'] = scaler.fit_transform(df_filtered[['Annual CO₂ emissions including land-use change']])
#df_filtered['Annual CO₂ emissions from land-use change_标准化'] = scaler.fit_transform(df_filtered[['Annual CO₂ emissions from land-use change']])
    
# Normalization: Min-Max scaling (also disabled).
#min_max_scaler = MinMaxScaler()
#df_filtered['Annual CO₂ emissions_归一化'] = min_max_scaler.fit_transform(df_filtered[['Annual CO₂ emissions']])
#df_filtered['Annual CO₂ emissions including land-use change_归一化'] = min_max_scaler.fit_transform(df_filtered[['Annual CO₂ emissions including land-use change']])
#df_filtered['Annual CO₂ emissions from land-use change_归一化'] = min_max_scaler.fit_transform(df_filtered[['Annual CO₂ emissions from land-use change']])

In [4]:
# Finally, write the cleaned frame to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df_filtered.to_csv('filtered_data.csv',index=False)

# 第二个数据集

In [3]:
# Load the CSV file. The path is written as a raw string: in a normal string
# literal the backslashes of this Windows path form invalid escape sequences,
# which Python reports with a warning. The resulting path is unchanged.
df = pd.read_csv(r'E:\数据\Carbon_(CO2)_Emissions_by_Country.csv')

# Basic structure of the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("\n数据集统计描述：")
print(df.describe())

# Preview the first rows.
print("\n数据预览：")
print(df.head())

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5677 entries, 0 to 5676
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country                 5677 non-null   object 
 1   Region                  5677 non-null   object 
 2   Date                    5677 non-null   object 
 3   Kilotons of Co2         5677 non-null   float64
 4   Metric Tons Per Capita  5677 non-null   float64
dtypes: float64(2), object(3)
memory usage: 221.9+ KB
None

数据集统计描述：
       Kilotons of Co2  Metric Tons Per Capita
count     5.677000e+03             5677.000000
mean      1.412292e+05                4.325505
std       6.491258e+05                5.503834
min       0.000000e+00                0.000000
25%       1.380000e+03                0.570000
50%       9.170000e+03                2.360000
75%       5.848000e+04                6.340000
max       1.070722e+07               47.650000

数据预览：
       Country 

In [4]:
# Parse the Date column (day-month-year strings) into datetime values.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Order the rows by country first and by date within each country.
df_sorted = df.sort_values(['Country', 'Date'])

# Report missing values per column, then the number of duplicated rows.
print("缺失值统计：")
print(df_sorted.isna().sum())
print("重复值的数量：",df_sorted.duplicated().sum())

#缺失值和重复值不存在不用处理

# Outlier handling: drop the rows that fall outside the 1.5 * IQR fences.
def remove_outliers(df,col):
    """Keep only the rows of ``df`` whose ``col`` value is within the IQR fences.

    The lower fence is ``Q1 - 1.5 * IQR`` and the upper fence is
    ``Q3 + 1.5 * IQR``; values equal to a fence are kept. The input frame is
    not modified.
    """
    series = df[col]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    keep = series.between(q1 - 1.5 * spread, q3 + 1.5 * spread)
    return df[keep]

# Columns screened for outliers, in order; each pass filters the rows left by
# the previous one. Columns absent from the frame are skipped.
related_cols = ['Kilotons of Co2','Metric Tons Per Capita']
for col in related_cols:
    if col in df_sorted.columns:
        df_sorted = remove_outliers(df_sorted,col)

# The filtered frame is a selection taken from a larger one. Take an explicit
# copy before adding columns so pandas does not raise SettingWithCopyWarning.
df_sorted = df_sorted.copy()

# Standardization: Z-score scaling. The scaler is refitted for each column and
# the result is stored in a new "<column>_标准化" column.
scaler = StandardScaler()
for col in related_cols:
    df_sorted[f'{col}_标准化'] = scaler.fit_transform(df_sorted[[col]])

# Normalization: Min-Max scaling, stored in a new "<column>_归一化" column.
min_max_scaler = MinMaxScaler()
for col in related_cols:
    df_sorted[f'{col}_归一化'] = min_max_scaler.fit_transform(df_sorted[[col]])

缺失值统计：
Country                   0
Region                    0
Date                      0
Kilotons of Co2           0
Metric Tons Per Capita    0
dtype: int64
重复值的数量： 0


In [5]:
# Finally, write the processed frame to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df_sorted.to_csv('sorted_file.csv', index=False)

# 第三个数据集

In [6]:
# Load the CSV file. The path is written as a raw string: in a normal string
# literal the backslashes of this Windows path form invalid escape sequences,
# which Python reports with a warning. The resulting path is unchanged.
df = pd.read_csv(r'E:\数据\global-data-on-sustainable-energy (1).csv')

# Only some of the columns are used, so list the ones to keep.
keep = [
    'Entity',
    'Year',
    'Renewable-electricity-generating-capacity-per-capita',
    'Renewable energy share in the total final energy consumption (%)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Primary energy consumption per capita (kWh/person)',
    'Value_co2_emissions_kt_by_country',
    'gdp_growth',
    'gdp_per_capita',
]

# .copy() makes the column subset an independent frame, so later in-place
# edits act on it directly instead of on a selection of the full table.
df = df[keep].copy()

# Basic structure of the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("\n数据集统计描述：")
print(df.describe())

# Preview the first rows.
print("\n数据预览：")
print(df.head())

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3649 entries, 0 to 3648
Data columns (total 11 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Entity                                                            3649 non-null   object 
 1   Year                                                              3649 non-null   int64  
 2   Renewable-electricity-generating-capacity-per-capita              2718 non-null   float64
 3   Renewable energy share in the total final energy consumption (%)  3455 non-null   float64
 4   Electricity from fossil fuels (TWh)                               3628 non-null   float64
 5   Electricity from nuclear (TWh)                                    3523 non-null   float64
 6   Electricity from renewables (TWh)                                 3628 non-null   float64
 7   Primary energy consumpti

In [7]:
# Report missing values per column, then the number of duplicated rows.
print("缺失值统计：")
print(df.isnull().sum())
print("重复值的数量：",df.duplicated().sum())

# Handle missing values by filling each listed column with its own median.
# The previous form, df[col].fillna(..., inplace=True), is a chained in-place
# assignment: it acts on a temporary Series, triggers a warning in pandas 2.x
# and leaves df unchanged when Copy-on-Write is active. Filling through the
# frame and rebinding the result works in every pandas version.
fill_cols = [
    'Renewable-electricity-generating-capacity-per-capita',
    'Renewable energy share in the total final energy consumption (%)',
    'Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)',
    'Electricity from renewables (TWh)',
    'Value_co2_emissions_kt_by_country',
    'gdp_growth',
    'gdp_per_capita',
]
# fillna with a Series fills column by column, using the Series labels as
# column names; columns not listed in fill_cols are left untouched.
df = df.fillna(df[fill_cols].median())


缺失值统计：
Entity                                                                0
Year                                                                  0
Renewable-electricity-generating-capacity-per-capita                931
Renewable energy share in the total final energy consumption (%)    194
Electricity from fossil fuels (TWh)                                  21
Electricity from nuclear (TWh)                                      126
Electricity from renewables (TWh)                                    21
Primary energy consumption per capita (kWh/person)                    0
Value_co2_emissions_kt_by_country                                   428
gdp_growth                                                          317
gdp_per_capita                                                      282
dtype: int64
重复值的数量： 0


In [8]:
# Outlier handling: drop the rows that fall outside the 1.5 * IQR fences.
def remove_outliers(df,col):
    """Return the rows of ``df`` whose ``col`` value is not an IQR outlier.

    A value is kept when it lies between ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``, fences included. ``df`` itself is left unchanged.
    """
    values = df[col]
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    # Distance from each quartile to its fence.
    whisker = 1.5 * (upper_q - lower_q)
    within = values.ge(lower_q - whisker) & values.le(upper_q + whisker)
    return df.loc[within]

# Columns screened for outliers, in order. The second name used to stop at
# "...total final energy", which is not a column of this dataset, so the
# membership guard below silently skipped it; it now carries the full name
# 'Renewable energy share in the total final energy consumption (%)'.
related_cols = [
    'Renewable-electricity-generating-capacity-per-capita',
    'Renewable energy share in the total final energy consumption (%)',
]
for col in related_cols:
    if col in df.columns:
        df = remove_outliers(df,col)

In [9]:
# Finally, write the processed frame to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('newfile.csv', index=False)

# 第四个数据集

In [10]:
# Load the CSV file. The path is written as a raw string: in a normal string
# literal the backslashes of this Windows path form invalid escape sequences,
# which Python reports with a warning. The resulting path is unchanged.
df = pd.read_csv(r'E:\数据\energy.csv')

# Basic structure of the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("\n数据集统计描述：")
print(df.describe())

# Preview the first rows.
print("\n数据预览：")
print(df.head())

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55440 entries, 0 to 55439
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   55440 non-null  int64  
 1   Country                      55440 non-null  object 
 2   Energy_type                  55440 non-null  object 
 3   Year                         55440 non-null  int64  
 4   Energy_consumption           44287 non-null  float64
 5   Energy_production            44289 non-null  float64
 6   GDP                          40026 non-null  float64
 7   Population                   46014 non-null  float64
 8   Energy_intensity_per_capita  50358 non-null  float64
 9   Energy_intensity_by_GDP      50358 non-null  float64
 10  CO2_emission                 51614 non-null  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 4.7+ MB
None

数据集统计描述：
         Unnamed: 0        Year  Energy_consumption

In [11]:
# Per-column count of missing values.
print("缺失值统计：")
print(df.isna().sum())
# Number of rows that repeat an earlier row.
print("重复值的数量：",df.duplicated().sum())
# Handle missing values by discarding every row that contains at least one.
df = df.dropna()

缺失值统计：
Unnamed: 0                         0
Country                            0
Energy_type                        0
Year                               0
Energy_consumption             11153
Energy_production              11151
GDP                            15414
Population                      9426
Energy_intensity_per_capita     5082
Energy_intensity_by_GDP         5082
CO2_emission                    3826
dtype: int64
重复值的数量： 0


In [12]:
# Outlier handling: drop the rows that fall outside the 1.5 * IQR fences.
def remove_outliers(df,col):
    """Filter ``df`` down to the rows whose ``col`` value is inside the IQR fences.

    Rows are kept when ``Q1 - 1.5 * IQR <= value <= Q3 + 1.5 * IQR``; the
    frame passed in is not altered.
    """
    low, high = (df[col].quantile(q) for q in (0.25, 0.75))
    margin = 1.5 * (high - low)
    not_too_small = df[col] >= low - margin
    not_too_large = df[col] <= high + margin
    return df[not_too_small & not_too_large]

# Numeric columns screened for outliers, in this order; each pass works on
# the rows that survived the previous one.
related_cols = [
    'Energy_consumption',
    'Energy_production',
    'GDP',
    'Population',
    'Energy_intensity_per_capita',
    'Energy_intensity_by_GDP',
    'CO2_emission',
]
for col in related_cols:
    # Columns that are absent from the frame are skipped.
    if col not in df.columns:
        continue
    df = remove_outliers(df, col)


In [13]:
# Finally, write the processed frame to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('newfile2.csv', index=False)

# 第五个数据集

In [14]:
# Load the CSV file. The path is written as a raw string: in a normal string
# literal the backslashes of this Windows path form invalid escape sequences,
# which Python reports with a warning. The resulting path is unchanged.
df = pd.read_csv(r'E:\数据\climate_change_data.csv')

# Basic structure of the dataset.
print("数据集基本信息：")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()
print("\n数据集统计描述：")
print(df.describe())

# Preview the first rows.
print("\n数据预览：")
print(df.head())

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            10000 non-null  object 
 1   Location        10000 non-null  object 
 2   Country         10000 non-null  object 
 3   Temperature     10000 non-null  float64
 4   CO2 Emissions   10000 non-null  float64
 5   Sea Level Rise  10000 non-null  float64
 6   Precipitation   10000 non-null  float64
 7   Humidity        10000 non-null  float64
 8   Wind Speed      10000 non-null  float64
dtypes: float64(6), object(3)
memory usage: 703.3+ KB
None

数据集统计描述：
        Temperature  CO2 Emissions  Sea Level Rise  Precipitation  \
count  10000.000000   10000.000000    10000.000000   10000.000000   
mean      14.936034     400.220469       -0.003152      49.881208   
std        5.030616      49.696933        0.991349      28.862417   
min       -3.803589     182.131220       -4.

In [15]:
# Show how many values are missing in each column.
print("缺失值统计：")
print(df.isna().sum())
# Show how many rows repeat an earlier row.
print("重复值的数量：",df.duplicated().sum())
# Handle missing values: discard any row that has at least one.
df = df.dropna(axis=0, how='any')

缺失值统计：
Date              0
Location          0
Country           0
Temperature       0
CO2 Emissions     0
Sea Level Rise    0
Precipitation     0
Humidity          0
Wind Speed        0
dtype: int64
重复值的数量： 0


In [16]:
#缺失值和重复值不存在不用处理

# Outlier handling: drop the rows that fall outside the 1.5 * IQR fences.
def remove_outliers(df,col):
    """Select the rows of ``df`` whose ``col`` value passes the IQR outlier test.

    The accepted range is ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` with both ends
    included. A filtered selection is returned; ``df`` is not changed.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    reach = 1.5 * (q3 - q1)
    floor, ceiling = q1 - reach, q3 + reach
    return df.loc[lambda frame: frame[col].between(floor, ceiling)]

# Columns screened for outliers, in this order; each pass works on the rows
# that survived the previous one.
related_cols = [
    'Temperature',
    'CO2 Emissions',
    'Humidity',
    'Wind Speed',
]
for col in related_cols:
    # A column that is absent from the frame leaves df as it is.
    df = remove_outliers(df, col) if col in df.columns else df

In [17]:
# Finally, write the processed frame to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df.to_csv('newfile3.csv', index=False)