In [122]:
import pandas as pd
import numpy as np

data=pd.read_csv("/content/weather.csv")
print(data.head())

   Temperature  Humidity  Wind Speed  Precipitation (%)    Cloud Cover  \
0           14        73         9.5                 82  partly cloudy   
1           39        96         8.5                 71  partly cloudy   
2           30        64         7.0                 16          clear   
3           38        83         1.5                 82          clear   
4           27        74        17.0                 66       overcast   

   Atmospheric Pressure  UV Index  Season  Visibility (km)  Location  \
0               1010.82         2  Winter              3.5    inland   
1               1011.43         7  Spring             10.0    inland   
2               1018.72         5  Spring              5.5  mountain   
3               1026.25         7  Spring              1.0   coastal   
4                990.67         1  Winter              2.5  mountain   

  Weather Type  
0        Rainy  
1       Cloudy  
2        Sunny  
3        Sunny  
4        Rainy  


In [123]:
#data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  int64  
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  int64  
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(3), int64(4), object(4)
memory usage: 1.1+ MB


In [124]:
data.isnull().sum()

Unnamed: 0,0
Temperature,0
Humidity,0
Wind Speed,0
Precipitation (%),0
Cloud Cover,0
Atmospheric Pressure,0
UV Index,0
Season,0
Visibility (km),0
Location,0


In [125]:
#check duplication
duplicate_rows = data[data.duplicated()]
print(duplicate_rows)

Empty DataFrame
Columns: [Temperature, Humidity, Wind Speed, Precipitation (%), Cloud Cover, Atmospheric Pressure, UV Index, Season, Visibility (km), Location, Weather Type]
Index: []


In [126]:
#drop duplication
data_cleaned = data.drop_duplicates()

In [127]:
#Remove outlayout(z_score)

from scipy.stats import zscore

numeric_cols = data_cleaned.select_dtypes(include=[np.number]).columns
z_scores = zscore(data[numeric_cols])
z_scores_df = pd.DataFrame(z_scores, columns=numeric_cols)
outliers = (np.abs(z_scores_df) > 3).any(axis=1)
data_cleaned = data[~outliers]
data_cleaned.reset_index(drop=True, inplace=True)

print(data_cleaned.head())
print("Original shape:", data.shape)
print("Cleaned shape:", data_cleaned.shape)

   Temperature  Humidity  Wind Speed  Precipitation (%)    Cloud Cover  \
0           14        73         9.5                 82  partly cloudy   
1           39        96         8.5                 71  partly cloudy   
2           30        64         7.0                 16          clear   
3           38        83         1.5                 82          clear   
4           27        74        17.0                 66       overcast   

   Atmospheric Pressure  UV Index  Season  Visibility (km)  Location  \
0               1010.82         2  Winter              3.5    inland   
1               1011.43         7  Spring             10.0    inland   
2               1018.72         5  Spring              5.5  mountain   
3               1026.25         7  Spring              1.0   coastal   
4                990.67         1  Winter              2.5  mountain   

  Weather Type  
0        Rainy  
1       Cloudy  
2        Sunny  
3        Sunny  
4        Rainy  
Original shape: (132

In [128]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data_cleaned.loc[:, "Cloud Cover"] = le.fit_transform(data_cleaned["Cloud Cover"])
data_cleaned.loc[:, "Season"] = le.fit_transform(data_cleaned["Season"])
data_cleaned.loc[:, "Location"] = le.fit_transform(data_cleaned["Location"])
data_cleaned.loc[:, "Weather Type"] = le.fit_transform(data_cleaned["Weather Type"])

data_cleaned.loc[:, "Wind Speed"] = data_cleaned["Wind Speed"].astype(int)
data_cleaned.loc[:, "Atmospheric Pressure"] = data_cleaned["Atmospheric Pressure"].astype(int)
data_cleaned.loc[:, "Visibility (km)"] = data_cleaned["Visibility (km)"].astype(int)

#y_encoded = le.fit_transform(y)

print(data_cleaned.head())

   Temperature  Humidity  Wind Speed  Precipitation (%) Cloud Cover  \
0           14        73         9.0                 82           3   
1           39        96         8.0                 71           3   
2           30        64         7.0                 16           0   
3           38        83         1.0                 82           0   
4           27        74        17.0                 66           2   

   Atmospheric Pressure  UV Index Season  Visibility (km) Location  \
0                1010.0         2      3              3.0        1   
1                1011.0         7      1             10.0        1   
2                1018.0         5      1              5.0        2   
3                1026.0         7      1              1.0        0   
4                 990.0         1      3              2.0        2   

  Weather Type  
0            1  
1            0  
2            3  
3            3  
4            1  


In [129]:
x=data_cleaned.drop("Weather Type",axis=1)
y=data_cleaned["Weather Type"]
print(x.head())
print(y.head())

   Temperature  Humidity  Wind Speed  Precipitation (%) Cloud Cover  \
0           14        73         9.0                 82           3   
1           39        96         8.0                 71           3   
2           30        64         7.0                 16           0   
3           38        83         1.0                 82           0   
4           27        74        17.0                 66           2   

   Atmospheric Pressure  UV Index Season  Visibility (km) Location  
0                1010.0         2      3              3.0        1  
1                1011.0         7      1             10.0        1  
2                1018.0         5      1              5.0        2  
3                1026.0         7      1              1.0        0  
4                 990.0         1      3              2.0        2  
0    1
1    0
2    3
3    3
4    1
Name: Weather Type, dtype: object


In [130]:


#anove
from sklearn.feature_selection import SelectKBest, f_classif
numeric_col_x = x.select_dtypes(include='number').columns

selector = SelectKBest(score_func=f_classif, k=4)
selector.fit(x[numeric_col_x], y)

selected_cols = numeric_col_x[selector.get_support()]
x_anova = x[selected_cols]

x_anova.head()

Unnamed: 0,Temperature,Precipitation (%),UV Index,Visibility (km)
0,14,82,2,3.0
1,39,71,7,10.0
2,30,16,5,5.0
3,38,82,7,1.0
4,27,66,1,2.0


In [131]:
#chi_square
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
import pandas as pd

x = data.drop("Weather Type", axis=1)
y = data["Weather Type"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

categorical_cols = x.select_dtypes(include="number").columns
x_categorical = x[categorical_cols]
x_categorical_clipped = x_categorical.clip(lower=0)

selector = SelectKBest(score_func=chi2, k=4)
selector.fit(x_categorical_clipped, y_encoded)

selected_cols = categorical_cols[selector.get_support()]
x_chi2 = x[selected_cols]

print(x_chi2.head())
# Convert y_encoded to a pandas Series before calling head()
print('y:', pd.Series(y_encoded).head())

   Temperature  Humidity  Precipitation (%)  UV Index
0           14        73                 82         2
1           39        96                 71         7
2           30        64                 16         5
3           38        83                 82         7
4           27        74                 66         1
y: 0    1
1    0
2    3
3    3
4    1
dtype: int64


In [132]:
#create new column withh chi2 and anova
x_final = pd.concat([x_anova, x_chi2], axis=1)
x_final = x_final.loc[:, ~x_final.columns.duplicated()]
print(x_final.head())

   Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity
0         14.0               82.0       2.0              3.0        73
1         39.0               71.0       7.0             10.0        96
2         30.0               16.0       5.0              5.0        64
3         38.0               82.0       7.0              1.0        83
4         27.0               66.0       1.0              2.0        74


In [133]:
x_final.columns

Index(['Temperature', 'Precipitation (%)', 'UV Index', 'Visibility (km)',
       'Humidity'],
      dtype='object')

In [134]:
#skew normalization

from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()
cols = ['Temperature', 'Precipitation (%)', 'UV Index', 'Visibility (km)','Humidity']
x_final[cols] = pt.fit_transform(x_final[cols])
skew_values = x_final[cols].skew()

print(skew_values)
print(x_final.head())


Temperature         -0.242064
Precipitation (%)   -0.339881
UV Index             0.001928
Visibility (km)     -0.068493
Humidity            -0.117439
dtype: float64
   Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity
0    -0.268215           0.901354 -0.173423        -0.619193  0.159273
1     1.204739           0.615011  1.028920         1.652357  1.433830
2     0.681969          -1.118828  0.677054         0.156605 -0.300035
3     1.146963           0.901354  1.028920        -1.630853  0.696471
4     0.506149           0.480641 -0.672682        -1.081845  0.211755


In [135]:
#scalling
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

# Select only numeric columns for scaling
numeric_cols = x_final.select_dtypes(include=np.number).columns
scaled_data = scaler.fit_transform(data_cleaned[numeric_cols])

scaled_data=pd.DataFrame(scaled_data,columns=numeric_cols)
print(scaled_data.head())

   Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity
0     0.406250           0.752294  0.142857         0.200000  0.595506
1     0.666667           0.651376  0.500000         0.666667  0.853933
2     0.572917           0.146789  0.357143         0.333333  0.494382
3     0.656250           0.752294  0.500000         0.066667  0.707865
4     0.541667           0.605505  0.071429         0.133333  0.606742


In [136]:
#i have joined x_final and y
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(data_cleaned["Weather Type"])
y = le.transform(data_cleaned["Weather Type"])
y = pd.Series(y, name="Weather Type")  # convert to Series so you can use .head()

print(y.head())

new_data = pd.concat([scaled_data, y], axis=1)
print(new_data.head())
print(new_data.shape)


0    1
1    0
2    3
3    3
4    1
Name: Weather Type, dtype: int64
   Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity  \
0     0.406250           0.752294  0.142857         0.200000  0.595506   
1     0.666667           0.651376  0.500000         0.666667  0.853933   
2     0.572917           0.146789  0.357143         0.333333  0.494382   
3     0.656250           0.752294  0.500000         0.066667  0.707865   
4     0.541667           0.605505  0.071429         0.133333  0.606742   

   Weather Type  
0             1  
1             0  
2             3  
3             3  
4             1  
(12151, 6)


In [137]:
#outliat_using_zscore
from scipy.stats import zscore
numeric_cols = new_data.select_dtypes(include=[np.number]).drop('Weather Type', axis=1).columns
z_scores = zscore(new_data[numeric_cols])
z_scores_df = pd.DataFrame(z_scores, columns=numeric_cols)
outliers = (np.abs(z_scores_df) > 3).any(axis=1)
new_data = new_data[~outliers].reset_index(drop=True)

print("new_data shape after Z-score outlier removal:", new_data.shape)

print(new_data.head())


new_data shape after Z-score outlier removal: (12061, 6)
   Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity  \
0     0.406250           0.752294  0.142857         0.200000  0.595506   
1     0.666667           0.651376  0.500000         0.666667  0.853933   
2     0.572917           0.146789  0.357143         0.333333  0.494382   
3     0.656250           0.752294  0.500000         0.066667  0.707865   
4     0.541667           0.605505  0.071429         0.133333  0.606742   

   Weather Type  
0             1  
1             0  
2             3  
3             3  
4             1  


In [138]:
from sklearn.model_selection import train_test_split

# Splitting features and target
x_new_1 = new_data.drop("Weather Type", axis=1)
y_new_1 = new_data["Weather Type"]

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x_new_1, y_new_1, test_size=0.2, random_state=42)

# Display results
print(x_train.head())
print(y_train.head())

       Temperature  Precipitation (%)  UV Index  Visibility (km)  Humidity
3288      0.541667           0.091743  0.428571         0.600000  0.393258
9736      0.218750           0.743119  0.000000         0.066667  0.685393
6208      0.562500           0.073394  0.714286         0.600000  0.415730
10096     0.260417           0.715596  0.071429         0.200000  0.584270
10        0.656250           0.146789  0.785714         0.466667  0.258427
3288     3
9736     2
6208     3
10096    2
10       3
Name: Weather Type, dtype: int64


In [139]:
print(x_new_1.isnull().sum())

Temperature          0
Precipitation (%)    0
UV Index             0
Visibility (km)      0
Humidity             0
dtype: int64


In [146]:
print(x_train.columns)
print(data.head(3))

Index(['Temperature', 'Precipitation (%)', 'UV Index', 'Visibility (km)',
       'Humidity'],
      dtype='object')
   Temperature  Humidity  Wind Speed  Precipitation (%)    Cloud Cover  \
0           14        73         9.5                 82  partly cloudy   
1           39        96         8.5                 71  partly cloudy   
2           30        64         7.0                 16          clear   

   Atmospheric Pressure  UV Index  Season  Visibility (km)  Location  \
0               1010.82         2  Winter              3.5    inland   
1               1011.43         7  Spring             10.0    inland   
2               1018.72         5  Spring              5.5  mountain   

  Weather Type  
0        Rainy  
1       Cloudy  
2        Sunny  


In [142]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train, y_train)
predicted = model.predict(x_train)

In [147]:
zn=model.predict([[26,76,6,7.8,72]])
print('The Prediction is: ',zn)

The Prediction is:  [1]




In [149]:
from sklearn.metrics import top_k_accuracy_score, accuracy_score
print(accuracy_score(y_train,predicted))

0.9134535655058044
