<a href="https://colab.research.google.com/github/YanaKnudsen/basicML/blob/main/basicML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Weather prediction using Polynomial Regression


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import (AutoMinorLocator, MultipleLocator)

In [None]:
# Load the weather history table. The absolute /content path presumably refers to
# a file uploaded to the Colab session (see the Colab badge at the top) - adjust when running elsewhere.
weather_data=pd.read_csv("/content/weatherHistory.csv")
# Bare last expression: the notebook renders the frame as the cell output.
weather_data

In [None]:
# Summary statistics of the loaded table, shown as the cell output.
weather_data.describe()

In [None]:
# Predictor columns used to model humidity.
weather_features=[
    "Temperature (C)",
    "Apparent Temperature (C)",
    "Wind Speed (km/h)",
    "Wind Bearing (degrees)",
    "Visibility (km)",
    "Pressure (millibars)",
]
# Feature matrix: all rows, only the predictor columns.
X=weather_data.loc[:,weather_features]
X

In [None]:
# Regression target: the Humidity column.
Y=weather_data["Humidity"]
Y

In [None]:
# Scatter every predictor against humidity, one panel per feature.
# A 3x2 grid holds exactly the six features (a 4x2 grid leaves an empty row),
# and each panel is labelled so it can be read on its own.
fig,axes=plt.subplots(3,2,figsize=(10,10))
for panel,feature in zip(axes.ravel(),X.columns):
    panel.scatter(X[feature],Y,s=0.2)
    panel.set_xlabel(feature)
    panel.set_ylabel("Humidity")

# Finer minor ticks on the first (temperature) panel only.
ax=axes.ravel()[0]
ax.xaxis.set_minor_locator(AutoMinorLocator(10))
fig.tight_layout()

In [None]:
#preprocessing data
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [None]:
# Standardise each feature column, then expand the scaled features into
# polynomial terms up to degree 7.
# NOTE(review): the scaling statistics come from the full data set, i.e. they are
# computed before the train/test split and therefore also see the test rows.
X_scaled=preprocessing.scale(X)
poly=PolynomialFeatures(degree=7)
X_final=poly.fit_transform(X_scaled)

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(
    X_final,
    Y,
    test_size=0.10,
    random_state=42,
)

In [None]:
from sklearn import linear_model

# Ridge regression = linear regression with regularization;
# alpha is the regularization hyperparameter. fit() returns the fitted estimator itself.
regr=linear_model.Ridge(alpha=0.5).fit(X_train,y_train)

# Predictions for the held-out rows, shown as the cell output.
y_pred=regr.predict(X_test)
y_pred

In [None]:
# Show the fitted ridge model's intercept and coefficient vector.
print("Intercept:",regr.intercept_)
print("Coefficients:",regr.coef_)

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

# Score the predictions against the held-out targets.
test_mse=mean_squared_error(y_test,y_pred)
test_r2=r2_score(y_test,y_pred)

# Both figures are printed with three digits after the decimal point.
print("mean squared error: %.3f"% test_mse)
print("coefficient of determination: %.3f"% test_r2)

In [None]:
# Predict humidity for one hand-written observation.
# The observation has to go through the SAME transformation as the training data.
# Standardising a single row on its own centres every column on itself and yields
# all zeros, so the model would ignore the input. A StandardScaler fitted on X
# uses the same per-column mean/std that preprocessing.scale(X) applied.
weatherObs=[[32,31.4,44,344,13,1020.33]]
obs_scaler=preprocessing.StandardScaler().fit(X)
weatherObs_scaled=obs_scaler.transform(pd.DataFrame(weatherObs,columns=X.columns))
# transform, not fit_transform: reuse the already fitted polynomial expansion.
weatherObs_final=poly.transform(weatherObs_scaled)

y_pred=regr.predict(weatherObs_final)
y_pred

# Rain forecast using Logistic Regression

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import scipy
from sklearn import preprocessing

# Ignore all warnings from here on.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence warnings
# raised by the libraries used in the cells that follow.
warnings.filterwarnings("ignore")

In [None]:
# Load the Australian weather table (rebinds weather_data); the /content path
# presumably points at a file uploaded to the Colab session.
weather_data=pd.read_csv("/content/weatherAUS.csv")
# Preview the first rows.
weather_data.head()

In [None]:
# Non-null count per column in ascending order, so the columns with the most missing values come first.
weather_data.count().sort_values()

In [None]:
# Columns removed before modelling.
dropped_columns=["Sunshine","Evaporation","Cloud3pm","Cloud9am","RISK_MM","Date"]
# columns= already names the axis, so no separate axis argument is needed.
weather_data=weather_data.drop(columns=dropped_columns)

In [None]:
# Keep only the rows that have no missing value in any remaining column.
weather_data=weather_data.dropna()

In [None]:
# Names of the float64 columns, in frame order.
numerical=[name for name,dtype in weather_data.dtypes.items() if dtype=="float64"]
numerical

In [None]:
# Standardise each float column and write it back into the frame.
for name in numerical:
    weather_data[name]=preprocessing.scale(weather_data[name])

# Preview the scaled values.
weather_data.head()

In [None]:
# (rows, columns) before the outlier filter, for comparison with the shape shown after it.
weather_data.shape

In [None]:
# Outlier filter: keep the rows whose standardised values all lie within
# 3 standard deviations of their column mean.
# Only the columns standardised earlier (`numerical`) are z-scores, so select exactly
# those instead of relying on the private DataFrame._get_numeric_data() helper.
z=weather_data[numerical].abs()
weather_data=weather_data[(z<3).all(axis=1)]
# (rows, columns) after filtering.
weather_data.shape

In [None]:
# Encode the Yes/No rain flags as 1/0.
# weather_data is itself the result of a boolean filter, so calling
# replace(..., inplace=True) on a selected column is chained assignment: pandas may
# apply it to a temporary copy (it always does under copy-on-write) and leave the
# frame unchanged. Building the columns and rebinding the frame always takes effect.
rain_flag={'No':0,'Yes':1}
weather_data=weather_data.assign(
    # astype makes the integer dtype explicit and fails loudly on any value other than No/Yes/0/1.
    RainToday=weather_data['RainToday'].replace(rain_flag).astype("int64"),
    RainTomorrow=weather_data['RainTomorrow'].replace(rain_flag).astype("int64"),
)

In [None]:
# Names of the columns still stored as Python objects (strings), in frame order.
categorical=[name for name,dtype in weather_data.dtypes.items() if dtype=="object"]
categorical

In [None]:
# Columns to be one-hot encoded.
categorical_columns=['Location','WindGustDir','WindDir9am','WindDir3pm']

# Print the distinct values of each one (np.unique returns them sorted).
for name in categorical_columns:
    print(np.unique(weather_data[name]))

In [None]:
# Apply one-hot encoding: each listed categorical column is replaced by
# one indicator column per distinct value.
weather_data=pd.get_dummies(weather_data,columns=categorical_columns)
# Preview the widened frame.
weather_data.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Features are every column except the label; the label is RainTomorrow.
X=weather_data.loc[:,weather_data.columns!="RainTomorrow"]
y=weather_data.RainTomorrow
# Hold out 25% of the rows; fixed seed for a repeatable split.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

# The solver's default cap is 100 iterations. With warnings silenced in this notebook
# a non-converged fit would go unnoticed, so give the optimiser more room.
logreg=LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)

# Mean accuracy on the held-out rows (previously the split was made but never scored).
logreg.score(X_test,y_test)

In [None]:
# K-fold cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean

# Ten shuffled folds with a fixed seed.
cv=KFold(n_splits=10,shuffle=True,random_state=1)

# One accuracy value per fold, computed on the full X/y.
scores=cross_val_score(logreg,X,y,cv=cv,scoring="accuracy")

# Report the mean accuracy over the folds.
average_score=mean(scores)
print("Overall score:",average_score)


# Iris clustering using K-means clustering

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the iris table; the /content path presumably points at a file uploaded to the Colab session.
iris_data=pd.read_csv("/content/IrisData.csv")
# Only the LAST expression of a cell is rendered, so a bare `iris_data` here would be
# silently discarded. display() (provided by the notebook runtime) shows the preview explicitly.
display(iris_data.head())
# Distinct species labels, shown as the cell output.
iris_data.Species.unique()

In [None]:
# Sepal length against petal width, coloured by the true species label.
sns.scatterplot(data=iris_data,x="SepalLengthCm",y="PetalWidthCm",hue=iris_data.Species,palette="coolwarm_r")

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster on two measurements only: sepal length and petal width.
X=iris_data[["SepalLengthCm","PetalWidthCm"]]

# Three clusters, random initial centroids, three restarts; fixed seed for repeatability.
km=KMeans(
    n_clusters=3,
    init="random",
    n_init=3,
    random_state=42,
)

# Fit, then label every row with its nearest centroid.
y_kmeans=km.fit(X).predict(X)
y_kmeans

In [None]:
# Same scatter as for the species, now coloured by the K-means cluster label.
sns.scatterplot(
    data=iris_data,
    x="SepalLengthCm",
    y="PetalWidthCm",
    hue=y_kmeans,
    palette="coolwarm_r",
)

# Overlay the fitted centroids; column 0 is sepal length, column 1 is petal width.
centers=km.cluster_centers_
centroid_x=centers[:,0]
centroid_y=centers[:,1]
plt.scatter(centroid_x,centroid_y,c='black',alpha=0.6)

Cluster labels correspond to the species as follows:

- 0: Iris-versicolor
- 1: Iris-setosa
- 2: Iris-virginica

In [None]:
# Inertia of the fitted model: sum of squared distances of the samples to their closest cluster centre.
km.inertia_

In [None]:
# Assign one new flower (sepal length 4.7, petal width 0.8) to its nearest cluster.
newData=[[4.7,0.8]]
# The model was fitted on a DataFrame, so pass the observation with the same
# column names instead of a bare list; this avoids the feature-name mismatch.
newData_df=pd.DataFrame(newData,columns=["SepalLengthCm","PetalWidthCm"])
y_pred=km.predict(newData_df)
y_pred

In [None]:
# The same two measurements without any colouring, i.e. the data as the clustering algorithm sees it.
sns.scatterplot(data=iris_data,x="SepalLengthCm",y="PetalWidthCm")

In [None]:
# Elbow method: fit K-means for k = 1..14 and plot the inertia against k.
inertia=[]
K=range(1,15)
for k in K:
    # Use a separate name so the fitted 3-cluster model `km` from the earlier cells
    # is not overwritten, and fix the seed / number of restarts so the curve is repeatable.
    km_k=KMeans(n_clusters=k,n_init=10,random_state=42).fit(X)
    inertia.append(km_k.inertia_)

plt.plot(K,inertia,marker="x")
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow method')
plt.show()
#number of clusters is 3

# SVM for pulsar classification

In [None]:
import pandas as pd
import warnings
import numpy as np

# Silence all warnings from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries used in the cells that follow.
warnings.filterwarnings("ignore")
# Load the pulsar table; the /content path presumably points at a file uploaded to the Colab session.
star_data=pd.read_csv("/content/pulsar_data.csv")
# Preview the first rows.
star_data.head()

In [None]:
# Discard every row that contains a missing value.
star_data=star_data.dropna()
# Remaining nulls per column - all zeros after the drop.
star_data.isnull().sum()

In [None]:
# Data type of every column.
star_data.dtypes

In [None]:
# Strip leading/trailing whitespace from the column names so they can be referenced exactly.
star_data.columns=star_data.columns.str.strip()

# Show the cleaned names.
star_data.columns

In [None]:
# Features: every column except the label.
# The axis must be named: pandas 2.0 removed the positional form drop(label, 1),
# which now raises TypeError. columns= states the intent directly.
X=star_data.drop(columns='target_class')
# Label column.
y=star_data['target_class']

In [None]:
#scale data
from sklearn.preprocessing import StandardScaler

s_scaler=StandardScaler()
X_sc=pd.DataFrame(s_scaler.fit_transform(X),columns=X.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; fixed seed for a repeatable split.
X_train,X_test,y_train,y_test=train_test_split(
    X_sc,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

# Linear-kernel SVM.
clf_linear=svm.SVC(kernel='linear',C=10)
clf_linear.fit(X_train,y_train)
#evaluating results
y_pred=clf_linear.predict(X_test)
f1_linear=f1_score(y_test,y_pred)

#nonlinear svm
clf_rbf=svm.SVC(kernel='rbf',C=10)
clf_rbf.fit(X_train,y_train)
y_pred_rbf=clf_rbf.predict(X_test)
f1_rbf=f1_score(y_test,y_pred_rbf)

# A cell renders only its LAST expression, so a bare f1_score(...) in the middle
# of the cell is computed and thrown away. Print both scores explicitly.
print("F1 (linear kernel):",f1_linear)
print("F1 (rbf kernel):",f1_rbf)

# Agglomerative Hierarchical Clustering

In [None]:
# Reload the iris table; the /content path presumably points at a file uploaded to the Colab session.
iris_data=pd.read_csv("/content/IrisData.csv")
# Preview the first rows.
iris_data.head()

In [None]:
# Three measurements used for hierarchical clustering (rebinds X).
X=iris_data[["SepalLengthCm","PetalLengthCm","PetalWidthCm"]]
# (rows, columns) of the selection.
X.shape

In [None]:
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches

# One square figure with a single 3D axes.
fig=plt.figure(figsize=(5,5))
ax=fig.add_subplot(111,projection="3d")

#variables: the three selected columns, in order
x,y,z=(X.iloc[:,position] for position in range(3))

# color-code species
colors = {'Iris-setosa':'orange', 'Iris-versicolor':'grey', 'Iris-virginica':'lightblue'}

#legend: one coloured patch per species, in the order of the mapping above
species_patches=[
    mpatches.Patch(color=shade,label=species)
    for species,shade in colors.items()
]
ax.legend(handles=species_patches)

plt.title("Iris plants")

# Each point takes the colour mapped from its species.
point_colors=iris_data["Species"].map(colors)
ax.scatter(x,y,z,s=40,c=point_colors,marker='o',alpha=1)

ax.set_xlabel("SepalLengthCm")
ax.set_ylabel("PetalLengthCm")
ax.set_zlabel("PetalWidthCm")

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into three groups with Ward linkage;
# fit() returns the fitted estimator itself.
hc=AgglomerativeClustering(n_clusters=3,linkage="ward").fit(X)

# Cluster label assigned to every row.
hc.labels_

In [None]:
from matplotlib.colors import ListedColormap

# One square figure with a single 3D axes.
fig=plt.figure(figsize=(5,5))
ax=fig.add_subplot(111,projection="3d")

# One discrete colour per cluster label.
cmap=ListedColormap(["orangered","lightgreen","deepskyblue"])

#variables: the three selected columns, in order
x,y,z=(X.iloc[:,position] for position in range(3))

# Colour each point by its hierarchical-clustering label.
sc=ax.scatter(x,y,z,s=40,c=hc.labels_,cmap=cmap,marker='o',alpha=1)

# Legend entries derived from the scatter's colour mapping.
legend_handles,legend_labels=sc.legend_elements()
plt.legend(legend_handles,legend_labels)
plt.title("Hierarchical Clustering")

ax.set_xlabel("SepalLengthCm")
ax.set_ylabel("PetalLengthCm")
ax.set_zlabel("PetalWidthCm")

In [None]:
from scipy.cluster.hierarchy import dendrogram,linkage

In [None]:
#define linkage
# Keep the result under its own name: assigning it to `linkage` would shadow the
# imported scipy function, so running this cell a second time would fail with
# "'numpy.ndarray' object is not callable".
# Only a random sample of 20 data points is used, because a dendrogram of all
# points would be difficult to read.
linkage_matrix=linkage(X.sample(n=20,random_state=1),'ward')

plt.figure(figsize=(7,5))

dendrogram(linkage_matrix,orientation='top')
plt.title("Dendrogram")
# Axis label spelling corrected (was "Dissimalirty").
plt.ylabel("Dissimilarity")
plt.xlabel("Data point")