In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv("ny_temperature.csv", parse_dates=['date'])
df

Unnamed: 0,date,TimeEST,TemperatureF,Dew PointF,Humidity,Wind SpeedMPH,Conditions
0,2016-01-01,12:51 AM,42.1,25.0,51.0,4.6,Overcast
1,2016-01-01,1:51 AM,41.0,25.0,53.0,3.5,Overcast
2,2016-01-01,2:51 AM,41.0,26.1,55.0,4.6,Overcast
3,2016-01-01,3:51 AM,41.0,26.1,55.0,9.2,Overcast
4,2016-01-01,4:51 AM,39.9,26.1,58.0,10.4,Overcast
...,...,...,...,...,...,...,...
5136,2016-06-30,7:51 PM,78.1,60.1,54.0,5.8,Partly Cloudy
5137,2016-07-01,8:51 PM,77.0,61.0,58.0,3.5,Clear
5138,2016-07-01,9:51 PM,75.9,61.0,60.0,Calm,Clear
5139,2016-07-01,10:51 PM,75.9,61.0,60.0,4.6,Clear


In [4]:
#finding datatype
df.dtypes

date             datetime64[ns]
TimeEST                  object
TemperatureF            float64
Dew PointF              float64
Humidity                float64
Wind SpeedMPH            object
Conditions               object
dtype: object

In [5]:
type(df['date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [6]:
#convert time to 24 hr format
df['TimeEST'] = pd.to_datetime(df['TimeEST'], format='%I:%M %p').dt.time
df

Unnamed: 0,date,TimeEST,TemperatureF,Dew PointF,Humidity,Wind SpeedMPH,Conditions
0,2016-01-01,00:51:00,42.1,25.0,51.0,4.6,Overcast
1,2016-01-01,01:51:00,41.0,25.0,53.0,3.5,Overcast
2,2016-01-01,02:51:00,41.0,26.1,55.0,4.6,Overcast
3,2016-01-01,03:51:00,41.0,26.1,55.0,9.2,Overcast
4,2016-01-01,04:51:00,39.9,26.1,58.0,10.4,Overcast
...,...,...,...,...,...,...,...
5136,2016-06-30,19:51:00,78.1,60.1,54.0,5.8,Partly Cloudy
5137,2016-07-01,20:51:00,77.0,61.0,58.0,3.5,Clear
5138,2016-07-01,21:51:00,75.9,61.0,60.0,Calm,Clear
5139,2016-07-01,22:51:00,75.9,61.0,60.0,4.6,Clear


In [7]:
df.shape

(5141, 7)

In [8]:
df[df.isnull().any(axis=1)]

Unnamed: 0,date,TimeEST,TemperatureF,Dew PointF,Humidity,Wind SpeedMPH,Conditions
2715,2016-04-04,09:00:00,-9999.0,-9999.0,,-9999.0,Unknown
2737,2016-04-05,06:00:00,-9999.0,-9999.0,,-9999.0,Unknown
2885,2016-04-11,07:00:00,-9999.0,-9999.0,,-9999.0,Unknown
3075,2016-04-19,07:00:00,-9999.0,-9999.0,,-9999.0,Unknown


In [9]:
#drop na values as they are only 4
df = df.dropna()

In [10]:
# rename columns
new_col_name = {'date' : 'date', 'TimeEST':'time', 'TemperatureF':'temperature', 'Dew PointF':'dew', 'Humidity':'humidity', 'Wind SpeedMPH': 'windspeed', 'Conditions':'conditions'}
df = df.rename(columns=new_col_name)

In [11]:
# remove year
df['date'] = df['date'].dt.strftime('%m-%d')

In [12]:
# Extract month and year components
df['month'] = pd.to_datetime(df['date'], format='%m-%y').dt.month
df['year'] = pd.to_datetime(df['date'], format='%m-%y').dt.year
df = df.drop('date', axis=1)

In [13]:
# cyclical encoding
# Extract the hour component
df['hour'] = [t.hour for t in df['time']]

# Perform cyclical encoding using sine and cosine transformations
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Drop the original 'Time' and 'Hour' columns
df = df.drop(['time', 'hour'], axis=1)
df

Unnamed: 0,temperature,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos
0,42.1,25.0,51.0,4.6,Overcast,1,2001,0.000000,1.000000
1,41.0,25.0,53.0,3.5,Overcast,1,2001,0.258819,0.965926
2,41.0,26.1,55.0,4.6,Overcast,1,2001,0.500000,0.866025
3,41.0,26.1,55.0,9.2,Overcast,1,2001,0.707107,0.707107
4,39.9,26.1,58.0,10.4,Overcast,1,2001,0.866025,0.500000
...,...,...,...,...,...,...,...,...,...
5136,78.1,60.1,54.0,5.8,Partly Cloudy,6,2030,-0.965926,0.258819
5137,77.0,61.0,58.0,3.5,Clear,7,2001,-0.866025,0.500000
5138,75.9,61.0,60.0,Calm,Clear,7,2001,-0.707107,0.707107
5139,75.9,61.0,60.0,4.6,Clear,7,2001,-0.500000,0.866025


In [14]:
df.corr()

  df.corr()


Unnamed: 0,temperature,dew,humidity,month,year,hour_sin,hour_cos
temperature,1.0,0.825459,-0.020371,0.770696,0.160144,-0.142039,-0.121637
dew,0.825459,1.0,0.535846,0.695541,0.09544,0.021687,0.01495
humidity,-0.020371,0.535846,1.0,0.087038,-0.080206,0.234376,0.172879
month,0.770696,0.695541,0.087038,1.0,-0.02924,0.003634,0.004735
year,0.160144,0.09544,-0.080206,-0.02924,1.0,-0.005676,0.037565
hour_sin,-0.142039,0.021687,0.234376,0.003634,-0.005676,1.0,-5.8e-05
hour_cos,-0.121637,0.01495,0.172879,0.004735,0.037565,-5.8e-05,1.0


In [15]:
df['windspeed'].value_counts()

Calm       750
3.5        675
4.6        657
5.8        611
6.9        542
8.1        422
9.2        337
10.4       236
-9999.0    223
11.5       169
-9999      153
12.7       114
13.8        87
16.1        46
15          34
17.3        18
18.4        18
15.0        18
19.6        12
20.7         6
23           4
21.9         2
25.3         1
85.2         1
74.8         1
Name: windspeed, dtype: int64

In [16]:
df['humidity'].value_counts()

89.0    211
93.0    186
50.0    136
86.0    133
90.0    123
       ... 
10.0      2
15.0      2
11.0      1
9.0       1
98.0      1
Name: humidity, Length: 88, dtype: int64

In [17]:
df['temperature'].value_counts()

50.0    165
52.0    151
53.1    126
45.0    121
39.0    117
       ... 
63.1      1
63.7      1
61.9      1
61.5      1
75.2      1
Name: temperature, Length: 129, dtype: int64

In [18]:
df['dew'].value_counts()

43.0    132
30.0    122
46.9    115
39.0    110
48.0    108
       ... 
50.2      1
50.4      1
47.8      1
60.8      1
69.8      1
Name: dew, Length: 132, dtype: int64

In [19]:
# replace ambiguous values
df = df.replace(['Calm', -9999, -9999.0, '-9999', '-9999.0' ], np.NaN)

In [20]:
# windspeed has null values filled by us, to interpolate
df[df.isnull().any(axis=1)]


Unnamed: 0,temperature,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos
6,39.0,26.1,60.0,,Overcast,1,2001,1.000000e+00,6.123234e-17
15,39.0,19.9,46.0,,Overcast,1,2001,-7.071068e-01,-7.071068e-01
21,35.1,19.9,54.0,,Clear,1,2002,-7.071068e-01,7.071068e-01
30,33.1,18.0,54.0,,Scattered Clouds,1,2002,1.000000e+00,6.123234e-17
42,36.0,17.1,46.0,,Clear,1,2002,-1.000000e+00,-1.836970e-16
...,...,...,...,...,...,...,...,...,...
5129,82.9,55.0,38.0,,Clear,6,2030,1.224647e-16,-1.000000e+00
5130,84.0,55.9,38.0,,Partly Cloudy,6,2030,-2.588190e-01,-9.659258e-01
5131,82.9,53.1,36.0,,Mostly Cloudy,6,2030,-5.000000e-01,-8.660254e-01
5133,84.0,55.0,37.0,,Clear,6,2030,-8.660254e-01,-5.000000e-01


In [21]:
df.isnull().sum()

temperature       0
dew               0
humidity          0
windspeed      1126
conditions        0
month             0
year              0
hour_sin          0
hour_cos          0
dtype: int64

In [22]:
type(df['windspeed'][0])

str

In [23]:
# change datatype from str to float
df['windspeed'] = df['windspeed'].astype(float)

In [24]:
df['windspeed']=df['windspeed'].interpolate()
df.isnull().sum()

temperature    0
dew            0
humidity       0
windspeed      0
conditions     0
month          0
year           0
hour_sin       0
hour_cos       0
dtype: int64

In [25]:
df['windspeed'].value_counts()

3.500000    931
4.600000    698
5.800000    657
6.900000    548
8.100000    426
           ... 
5.315909      1
5.355682      1
5.395455      1
5.435227      1
5.980000      1
Name: windspeed, Length: 262, dtype: int64

In [26]:
df.corr()

  df.corr()


Unnamed: 0,temperature,dew,humidity,windspeed,month,year,hour_sin,hour_cos
temperature,1.0,0.825459,-0.020371,-0.292819,0.770696,0.160144,-0.142039,-0.121637
dew,0.825459,1.0,0.535846,-0.265075,0.695541,0.09544,0.021687,0.01495
humidity,-0.020371,0.535846,1.0,-0.003103,0.087038,-0.080206,0.234376,0.172879
windspeed,-0.292819,-0.265075,-0.003103,1.0,-0.274687,-0.020168,-0.0397,-0.121211
month,0.770696,0.695541,0.087038,-0.274687,1.0,-0.02924,0.003634,0.004735
year,0.160144,0.09544,-0.080206,-0.020168,-0.02924,1.0,-0.005676,0.037565
hour_sin,-0.142039,0.021687,0.234376,-0.0397,0.003634,-0.005676,1.0,-5.8e-05
hour_cos,-0.121637,0.01495,0.172879,-0.121211,0.004735,0.037565,-5.8e-05,1.0


In [27]:
# shuffle data 
# df = sampled_df.copy()
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)
df

Unnamed: 0,temperature,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos
0,50.0,36.0,59.0,6.900000,Clear,2,2001,9.659258e-01,0.258819
1,30.9,28.0,89.0,6.350000,Light Snow,2,2005,7.071068e-01,-0.707107
2,73.9,39.0,28.0,9.200000,Clear,5,2020,1.224647e-16,-1.000000
3,37.0,17.1,44.0,4.600000,Clear,1,2030,-2.588190e-01,-0.965926
4,63.1,56.5,79.0,5.276136,Unknown,5,2013,5.000000e-01,-0.866025
...,...,...,...,...,...,...,...,...,...
5132,66.9,62.1,84.0,4.600000,Overcast,6,2005,9.659258e-01,-0.258819
5133,30.9,23.0,72.0,4.600000,Clear,1,2018,0.000000e+00,1.000000
5134,51.1,24.1,35.0,5.800000,Clear,4,2020,8.660254e-01,0.500000
5135,66.9,35.1,31.0,3.500000,Clear,5,2011,5.000000e-01,-0.866025


In [28]:
# standardize the data
columns_to_standardize = ['dew', 'month', 'windspeed']
scaler = StandardScaler()
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])
df

Unnamed: 0,temperature,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos
0,50.0,0.060378,59.0,0.026161,Clear,-0.872550,2001,9.659258e-01,0.258819
1,30.9,-0.394122,89.0,-0.127650,Light Snow,-0.872550,2005,7.071068e-01,-0.707107
2,73.9,0.230816,28.0,0.669370,Clear,0.881084,2020,1.224647e-16,-1.000000
3,37.0,-1.013379,44.0,-0.617048,Clear,-1.457095,2030,-2.588190e-01,-0.965926
4,63.1,1.225035,79.0,-0.427962,Unknown,0.881084,2013,5.000000e-01,-0.866025
...,...,...,...,...,...,...,...,...,...
5132,66.9,1.543185,84.0,-0.617048,Overcast,1.465629,2005,9.659258e-01,-0.258819
5133,30.9,-0.678185,72.0,-0.617048,Clear,-1.457095,2018,0.000000e+00,1.000000
5134,51.1,-0.615691,35.0,-0.281461,Clear,0.296540,2020,8.660254e-01,0.500000
5135,66.9,0.009247,31.0,-0.924669,Clear,0.881084,2011,5.000000e-01,-0.866025


In [29]:
correlation_matrix = df.corr()
correlation_matrix

  correlation_matrix = df.corr()


Unnamed: 0,temperature,dew,humidity,windspeed,month,year,hour_sin,hour_cos
temperature,1.0,0.825459,-0.020371,-0.292819,0.770696,0.160144,-0.142039,-0.121637
dew,0.825459,1.0,0.535846,-0.265075,0.695541,0.09544,0.021687,0.01495
humidity,-0.020371,0.535846,1.0,-0.003103,0.087038,-0.080206,0.234376,0.172879
windspeed,-0.292819,-0.265075,-0.003103,1.0,-0.274687,-0.020168,-0.0397,-0.121211
month,0.770696,0.695541,0.087038,-0.274687,1.0,-0.02924,0.003634,0.004735
year,0.160144,0.09544,-0.080206,-0.020168,-0.02924,1.0,-0.005676,0.037565
hour_sin,-0.142039,0.021687,0.234376,-0.0397,0.003634,-0.005676,1.0,-5.8e-05
hour_cos,-0.121637,0.01495,0.172879,-0.121211,0.004735,0.037565,-5.8e-05,1.0


In [30]:
temp = df['temperature']
df = df.drop('temperature', axis = 1)
df['temperature'] = temp
df

Unnamed: 0,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos,temperature
0,0.060378,59.0,0.026161,Clear,-0.872550,2001,9.659258e-01,0.258819,50.0
1,-0.394122,89.0,-0.127650,Light Snow,-0.872550,2005,7.071068e-01,-0.707107,30.9
2,0.230816,28.0,0.669370,Clear,0.881084,2020,1.224647e-16,-1.000000,73.9
3,-1.013379,44.0,-0.617048,Clear,-1.457095,2030,-2.588190e-01,-0.965926,37.0
4,1.225035,79.0,-0.427962,Unknown,0.881084,2013,5.000000e-01,-0.866025,63.1
...,...,...,...,...,...,...,...,...,...
5132,1.543185,84.0,-0.617048,Overcast,1.465629,2005,9.659258e-01,-0.258819,66.9
5133,-0.678185,72.0,-0.617048,Clear,-1.457095,2018,0.000000e+00,1.000000,30.9
5134,-0.615691,35.0,-0.281461,Clear,0.296540,2020,8.660254e-01,0.500000,51.1
5135,0.009247,31.0,-0.924669,Clear,0.881084,2011,5.000000e-01,-0.866025,66.9


In [31]:
X = df[['month', 'humidity', 'dew']]
y = df['temperature']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [32]:
df

Unnamed: 0,dew,humidity,windspeed,conditions,month,year,hour_sin,hour_cos,temperature
0,0.060378,59.0,0.026161,Clear,-0.872550,2001,9.659258e-01,0.258819,50.0
1,-0.394122,89.0,-0.127650,Light Snow,-0.872550,2005,7.071068e-01,-0.707107,30.9
2,0.230816,28.0,0.669370,Clear,0.881084,2020,1.224647e-16,-1.000000,73.9
3,-1.013379,44.0,-0.617048,Clear,-1.457095,2030,-2.588190e-01,-0.965926,37.0
4,1.225035,79.0,-0.427962,Unknown,0.881084,2013,5.000000e-01,-0.866025,63.1
...,...,...,...,...,...,...,...,...,...
5132,1.543185,84.0,-0.617048,Overcast,1.465629,2005,9.659258e-01,-0.258819,66.9
5133,-0.678185,72.0,-0.617048,Clear,-1.457095,2018,0.000000e+00,1.000000,30.9
5134,-0.615691,35.0,-0.281461,Clear,0.296540,2020,8.660254e-01,0.500000,51.1
5135,0.009247,31.0,-0.924669,Clear,0.881084,2011,5.000000e-01,-0.866025,66.9


In [33]:
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)

X_train shape (3595, 3)
X_test shape (1542, 3)
y_train shape (3595,)
y_test shape (1542,)


In [51]:
# # model building
# SVM_model = SVC()
# SVM_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred_SVM = SVM_model.predict(X_test)


# Create an SVR model
svr = SVR(kernel='rbf', C=10, gamma=0.1)

# Train the model
svr.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svr = svr.predict(X_test)

In [80]:
# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred_svr)
print("Mean Squared Error:", mse)
rmse = np.sqrt(mse)
print("RMSE: ", rmse)

Mean Squared Error: 0.41325488147718054
RMSE:  0.6428490347485796


In [69]:
# Create a decision tree regressor
regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=2, max_features=1.0)
regressor.fit(X_train, y_train)

# predict
y_pred_reg = regressor.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred_reg)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.214467717746944


In [78]:
# Create a Random Forest Regression model
regRFR = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=1.0)
regRFR.fit(X_train, y_train)
y_pred_RFR = regRFR.predict(X_test)
mse = mean_squared_error(y_test, y_pred_RFR)
print("Mean squared error: ", mse)

Mean squared error:  0.46066588432167427


In [None]:
# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Make predictions on the test data
y_pred_knn = knn.predict(X_test)

In [None]:
# Calculate the accuracy of the knn classifier
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Accuracy:", accuracy_knn)