In [1]:
# Import necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data

In [4]:
data = pd.read_csv(r"https://raw.githubusercontent.com/edyoda/DS31032023/main/Machine%20Learning/September/4-9-2023/play_tennis.csv")
data.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [5]:
data.shape

(14, 6)

In [6]:
data.columns

Index(['day', 'outlook', 'temp', 'humidity', 'wind', 'play'], dtype='object')

# Observations
- The data contains 14 rows and 6 columns.
- play column is the target variable for this data.

## Data cleaning, data wrangling, data preprocessing

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   day       14 non-null     object
 1   outlook   14 non-null     object
 2   temp      14 non-null     object
 3   humidity  14 non-null     object
 4   wind      14 non-null     object
 5   play      14 non-null     object
dtypes: object(6)
memory usage: 800.0+ bytes


In [8]:
data.isnull().sum()

day         0
outlook     0
temp        0
humidity    0
wind        0
play        0
dtype: int64

In [9]:
data = data.drop('day',axis=1)
data.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


- No missing values

# Probabilities

In [None]:
## Event A: playing tennis

In [10]:
len(data)

14

In [11]:
### Probability of playing tennis

In [12]:
data['play'].value_counts()

Yes    9
No     5
Name: play, dtype: int64

In [13]:
## Static approach

p_a = 9/14
p_a

0.6428571428571429

In [16]:
## pandas approach

data_yes = data[data['play']== 'Yes']
data_yes.head()

Unnamed: 0,outlook,temp,humidity,wind,play
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
6,Overcast,Cool,Normal,Strong,Yes
8,Sunny,Cool,Normal,Weak,Yes


In [17]:
data_yes.shape[0]

9

In [18]:
data.shape[0]

14

In [19]:
p_a = data_yes.shape[0]/data.shape[0]
p_a

0.6428571428571429

In [20]:
## Event B: not playing tennis

In [21]:
data_no = data[data['play'] == 'No']
data_no

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
5,Rain,Cool,Normal,Strong,No
7,Sunny,Mild,High,Weak,No
13,Rain,Mild,High,Strong,No


In [22]:
data_no.shape[0]

5

In [23]:
data.shape[0]

14

In [24]:
p_b = data_no.shape[0]/data.shape[0]
p_b

0.35714285714285715

In [25]:
p_b = 1-(data_yes.shape[0]/data.shape[0])
p_b

0.3571428571428571

# observations
- The probability of playing tennis is 64.28%.
- The chance of not playing tennis is 35.71%.

In [26]:
# Event c: Probability of sunny outlook and playing tennis.

In [27]:
data_sopt = data[(data['outlook'] == 'Sunny')&(data['play'] == 'Yes')]
data_sopt

Unnamed: 0,outlook,temp,humidity,wind,play
8,Sunny,Cool,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes


In [28]:
data_sopt.shape[0]

2

In [29]:
data.shape[0]

14

In [30]:
p_c = data_sopt.shape[0]/data.shape[0]
p_c

0.14285714285714285

# Simpler way to find the probability of plaing tennis with different temp conditions

In [31]:
pd.crosstab(index = data['temp'], columns = data['play'], margins = True)

play,No,Yes,All
temp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cool,1,3,4
Hot,2,2,4
Mild,2,4,6
All,5,9,14


## Event D: Probability of playing tennis given that the temp is mild.

In [33]:
data_mild = data[data['temp'] == 'Mild']
data_mild

Unnamed: 0,outlook,temp,humidity,wind,play
3,Rain,Mild,High,Weak,Yes
7,Sunny,Mild,High,Weak,No
9,Rain,Mild,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes
11,Overcast,Mild,High,Strong,Yes
13,Rain,Mild,High,Strong,No


In [34]:
data_mild_yes = data_mild[data_mild['play'] == 'Yes']
data_mild_yes

Unnamed: 0,outlook,temp,humidity,wind,play
3,Rain,Mild,High,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes
11,Overcast,Mild,High,Strong,Yes


In [35]:
p_d = data_mild_yes.shape[0]/data_mild.shape[0]

In [36]:
p_d

0.6666666666666666

If the temp is mild, the chance of playing tennis is 66.66%.

# Feature Encoding

In [40]:
data['wind'].unique()

array(['Weak', 'Strong'], dtype=object)

In [43]:
dic1 = {'Sunny':2, 'Overcast':1, 'Rain':0}
dic2 = {'Hot':0, 'Mild':1, 'Cool':2}
dic3 = {'High':1, 'Normal':0}
dic4 = {'Weak':0, 'Strong':1}
dic5 = {'Yes':1,'No':0}

In [44]:
data['outlook'] = data['outlook'].replace(dic1)
data['temp'] = data['temp'].replace(dic2)
data['humidity'] = data['humidity'].replace(dic3)
data['wind'] = data['wind'].replace(dic4)
data['play'] = data['play'].replace(dic5)
data.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,0,1,0,0
1,2,0,1,1,0
2,1,0,1,0,1
3,0,1,1,0,1
4,0,2,0,0,1


In [45]:
# separate the data into x and y

In [46]:
x = data.drop('play',axis=1)
y = data['play']

In [47]:
# split the data

In [48]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state=0)

In [49]:
# Apply Gaussian NB on the train dataset

In [56]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb

In [57]:
mnb.fit(x_train,y_train)

In [52]:
# Perform prediction on the test dataset

In [58]:
y_pred = mnb.predict(x_test)
y_pred

array([1, 1, 1], dtype=int64)

In [54]:
# evaluations

In [59]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

1.0