In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the weather CSV; the path is relative to the notebook's working directory.
data =pd.read_csv('Datasets/seattleweather1948-2017_1738467795684.csv')

In [None]:
# Preview the first rows to see which columns are available.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# (rows, columns) before any cleaning.
data.shape

# EDA

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Show the rows in which PRCP or RAIN is missing.
data[data['PRCP'].isna() | data['RAIN'].isna()]

In [None]:
# Remove every row that has at least one missing value.
# NOTE: this mutates `data` in place, so the cells above show stale output once it has run.
data.dropna(inplace=True)

In [None]:
# Re-check the per-column missing counts after the drop.
data.isna().sum()

In [None]:
# Shape after dropping incomplete rows.
data.shape

In [None]:
def checkOutliers(data, col):
    """Plot a boxplot and a histogram of ``data[col]`` side by side.

    The histogram (with a KDE overlay) is annotated with dashed red lines at
    mean - 3*std and mean + 3*std of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to inspect.
    col : str
        Name of a numeric column in ``data``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Matplotlib's boxplot renders an empty box when the input contains NaN,
    # so missing values are dropped for the plot and the statistics.
    values = data[col].dropna()

    # Explicit figure/axes instead of the pyplot state machine.
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 4))   # bigger figure

    box_ax.boxplot(values)
    box_ax.set_title(f"Boxplot of {col}")

    sns.histplot(data=data, x=col, kde=True, ax=hist_ax)

    mean = values.mean()
    std = values.std()

    # Mark the +/- 3 standard deviation band around the mean.
    for bound in (mean + 3*std, mean - 3*std):
        hist_ax.axvline(bound, color='red', linestyle='--')
    hist_ax.set_title(f"Histogram of {col}")

    # A single layout pass once both axes exist.
    fig.tight_layout(pad=3)      # space between plots and from borders
    plt.show()

In [None]:
def handleOutliers(data,col):
    """Cap the values of ``data[col]`` at the 1.5 * IQR fences, in place.

    Values above ``Q3 + 1.5*IQR`` are replaced by that upper fence and values
    below ``Q1 - 1.5*IQR`` by the lower fence; everything else is unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. The column is overwritten on this object.
    col : str
        Name of a numeric column in ``data``.

    Returns
    -------
    None
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (1.5 * iqr)
    upper = q3 + (1.5 * iqr)
    # clip() builds a new Series and upcasts when a fence is fractional, so an
    # integer column no longer receives a float through element-wise .loc
    # assignment (a deprecated incompatible-dtype write in recent pandas).
    data[col] = data[col].clip(lower=lower, upper=upper)

In [None]:
checkOutliers(data,'PRCP')
checkOutliers(data,'TMAX')
checkOutliers(data,'TMIN')

In [None]:
handleOutliers(data,'PRCP')
handleOutliers(data,'TMAX')
handleOutliers(data,'TMIN')

In [None]:
checkOutliers(data,'PRCP')
checkOutliers(data,'TMAX')
checkOutliers(data,'TMIN')

# Feature Engineering

In [None]:
data.head()

In [None]:
data=data.drop('DATE',axis=1)

In [None]:
data.head()

In [None]:
# Alternative encoding, kept for reference (not executed):
# from sklearn.preprocessing import LabelEncoder
# le=LabelEncoder()
# data['RAIN']=le.fit_transform(data['RAIN'])

# Binary encoding: Rain → 1, No Rain → 0
# Any value that is not a key of the mapping becomes NaN.
data['RAIN'] = data['RAIN'].map({True: 1, False: 0})

In [None]:
# Confirm RAIN now holds 0/1.
data.head()

In [None]:
# Annotated heatmap of the pairwise correlations between all remaining columns.
_, corr_ax = plt.subplots(figsize=(5, 5))
sns.heatmap(data.corr(), annot=True, ax=corr_ax)
plt.show()

In [None]:
# Separate the target column from the predictors.
y = data['RAIN']
data = data.drop(columns=['RAIN'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
def calculate(data):
    """Return the variance inflation factor of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictor columns; passed unchanged to ``variance_inflation_factor``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns ``Attribute``
        (column name) and ``vif`` (its variance inflation factor).
    """
    scores = [
        variance_inflation_factor(data, position)
        for position in range(data.shape[1])
    ]
    return pd.DataFrame({'Attribute': data.columns, 'vif': scores})

In [None]:
# Variance inflation factors for the current predictor set.
calculate(data)

In [None]:
# Remove TMIN from the predictors before recomputing the VIFs.
data=data.drop(['TMIN'],axis=1)

In [None]:
# VIFs of the remaining predictors.
calculate(data)

In [None]:
data.shape

In [None]:
# X is another name for the same DataFrame object as `data` (no copy is made).
X=data

In [None]:
X.head()
# X.describe()

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_sm,y_sm = sm.fit_resample(X,y)

In [None]:
y.head()
# y.value_counts()

# Splitting data into training and testing

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_sm,y_sm,test_size=0.3,random_state=42)

In [None]:
# X_train

## Scaling

In [None]:
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

# # Fit only on training data
# # X_train_scaled = scaler.fit_transform(X_train)
# X_train = scaler.fit_transform(X_train)

# # Transform test data
# # X_test_scaled = scaler.transform(X_test)
# X_test = scaler.transform(X_test)

In [None]:
# Preview a few training rows; a bare last expression uses the notebook's
# rich table display instead of a plain-text dump of the frame.
X_train.head()

# Build Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
# Unfitted classifier with default hyper-parameters. Cross-validation and the
# final fit happen once, in the "Cross validate" section that follows.
lr=LogisticRegression()

## Cross validate

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# lr=LogisticRegression()
val_result=cross_validate(lr,X_train,y_train)
val_result

In [None]:
val_result['test_score'].mean()

In [None]:
lr.fit(X_train,y_train)

In [None]:
# Predicted class label for every row of the test split.
test_pred=lr.predict(X_test)


In [None]:
# Predicted class probabilities for every row of the test split
# (one column per class; presumably ordered as lr.classes_ — verify).
probability= lr.predict_proba(X_test)

In [None]:
# Show the predicted labels.
test_pred

In [None]:
# Show the predicted probabilities.
probability

In [None]:
# X_test

# Check Accuracy

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
accuracy_score(y_test,test_pred)

In [None]:
print(confusion_matrix(y_test, test_pred))


In [None]:
print(classification_report(y_test, test_pred))

In [None]:
# X.head()

In [None]:
lr.predict([[0.25,51.0]])
# new_data = pd.DataFrame({
#     'PRCP': [0.25],
#     'TMAX': [51.0]
# })

# new_data_scaled = scaler.transform(new_data)

# lr.predict(new_data_scaled)


In [None]:
# lr.predict_proba(new_data_scaled)

lr.predict_proba([[0.25,51.0]])

In [None]:
# Spot-check a few test rows: true label, predicted label and the predicted
# probability of each class, side by side in one table. Positions that fall
# outside the test split are skipped instead of raising IndexError.
sample_positions = [pos for pos in (0, 6666) if pos < len(y_test)]
comparison = pd.DataFrame(
    probability[sample_positions],
    columns=[f"P(RAIN={label})" for label in lr.classes_],
    index=sample_positions,
)
comparison.insert(0, 'predicted', test_pred[sample_positions])
comparison.insert(0, 'actual', y_test.iloc[sample_positions].to_numpy())
comparison

In [None]:
# X_test

In [None]:
test_pred

In [None]:
train_pred=lr.predict(X_train)
accuracy_score(y_train,train_pred)