# Initialization

In [2]:
# Mount Google Drive at /content/drive so files stored there (the dataset is
# copied from it in a later cell) are reachable from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Copy the dataset CSV from the mounted Drive folder into /content/, which is
# where the read_csv cell below loads it from.
!cp -r /content/drive/MyDrive/Academic/NASSCOM/T2/Dataset.csv /content/

In [36]:
import numpy as np
import pandas as pd

In [37]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split

In [38]:
import seaborn as sn
import matplotlib.pyplot as plt

In [39]:
# Load the raw temperature log that was copied into the Colab working area.
DATASET_PATH = "/content/Dataset.csv"
df = pd.read_csv(DATASET_PATH)

# Dataset before preprocessing

In [40]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97606 entries, 0 to 97605
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          97606 non-null  object 
 1   room_id/id  97606 non-null  object 
 2   noted_date  97606 non-null  object 
 3   temp        97592 non-null  float64
 4   out/in      97606 non-null  object 
dtypes: float64(1), object(4)
memory usage: 3.7+ MB


In [41]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,temp
count,97592.0
mean,35.054451
std,5.699924
min,21.0
25%,30.0
50%,35.0
75%,40.0
max,51.0


In [43]:
# Peek at the first rows of the raw data.
df.head()

Unnamed: 0,id,room_id/id,noted_date,temp,out/in
0,__export__.temp_log_196134_bd201015,Room Admin,08-12-2018 09:30,29.0,In
1,__export__.temp_log_196131_7bca51bc,Room Admin,08-12-2018 09:30,29.0,In
2,__export__.temp_log_196127_522915e3,Room Admin,08-12-2018 09:29,41.0,Out
3,__export__.temp_log_196128_be0919cf,Room Admin,08-12-2018 09:29,41.0,Out
4,__export__.temp_log_196126_d30b72fb,Room Admin,08-12-2018 09:29,31.0,In


# Encoding the `out/in` labels as integers (In → 0, Out → 1)

In [44]:
# Turn the categorical 'out/in' labels into integer codes and wrap them in a
# one-column frame (default 0..n-1 index) so they can be assigned back to `df`.
# With this data the encoder maps 'In' to 0 and 'Out' to 1.
le = preprocessing.LabelEncoder()
out_in_codes = le.fit_transform(df['out/in'])
df2 = pd.DataFrame({'out/in': out_in_codes})

In [45]:
# Inspect the first encoded labels.
df2.head()

Unnamed: 0,out/in
0,0
1,0
2,1
3,1
4,0


In [46]:
# Overwrite the string labels with their integer codes. Assigning a Series
# aligns on index labels; at this point `df` and `df2` both carry the default
# 0..n-1 index, so every row receives its own code.
df['out/in'] = df2['out/in']

In [47]:
# Confirm that 'out/in' now holds the integer codes instead of strings.
df.head()

Unnamed: 0,id,room_id/id,noted_date,temp,out/in
0,__export__.temp_log_196134_bd201015,Room Admin,08-12-2018 09:30,29.0,0
1,__export__.temp_log_196131_7bca51bc,Room Admin,08-12-2018 09:30,29.0,0
2,__export__.temp_log_196127_522915e3,Room Admin,08-12-2018 09:29,41.0,1
3,__export__.temp_log_196128_be0919cf,Room Admin,08-12-2018 09:29,41.0,1
4,__export__.temp_log_196126_d30b72fb,Room Admin,08-12-2018 09:29,31.0,0


### **Drop the columns which are not required for prediction**

In [48]:
# 'room_id/id' is not used for prediction, so discard that column.
df = df.drop(columns=['room_id/id'])

In [49]:
# Preview the frame to confirm 'room_id/id' is no longer among the columns.
df.head()

Unnamed: 0,id,noted_date,temp,out/in
0,__export__.temp_log_196134_bd201015,08-12-2018 09:30,29.0,0
1,__export__.temp_log_196131_7bca51bc,08-12-2018 09:30,29.0,0
2,__export__.temp_log_196127_522915e3,08-12-2018 09:29,41.0,1
3,__export__.temp_log_196128_be0919cf,08-12-2018 09:29,41.0,1
4,__export__.temp_log_196126_d30b72fb,08-12-2018 09:29,31.0,0


### **Drop the rows which have duplicate values**

In [50]:
# List rows that exactly repeat an earlier row (duplicated() flags every
# occurrence after the first).
df[df.duplicated()]

Unnamed: 0,id,noted_date,temp,out/in
11,__export__.temp_log_196108_4a983c7e,08-12-2018 09:25,42.0,1


In [51]:
# Keep only the first occurrence of each fully identical row. The original
# index labels are kept, so the index has gaps where rows were removed.
df = df.drop_duplicates()

In [53]:
# Re-run the duplicate check; an empty result confirms the duplicates are gone.
df[df.duplicated()]

Unnamed: 0,id,noted_date,temp,out/in


### **Replace the missing values with the mean**

In [54]:
# Count the missing values in each column.
df.isnull().sum()

id             0
noted_date     0
temp          14
out/in         0
dtype: int64

In [55]:
# Show the rows whose 'temp' reading is missing.
df[df.temp.isnull()]

Unnamed: 0,id,noted_date,temp,out/in
14,__export__.temp_log_196095_788b2c27,08-12-2018 09:22,,0
827,__export__.temp_log_193862_d4240019,07-12-2018 20:26,,0
828,__export__.temp_log_193861_f852cf83,07-12-2018 20:26,,1
1633,__export__.temp_log_191920_b3eb48a5,07-12-2018 08:48,,0
1679,__export__.temp_log_191738_756769f3,07-12-2018 07:28,,0
46009,__export__.temp_log_49946_9d2a088d,17-10-2018 07:30,,1
68184,__export__.temp_log_141680_9e776670,11-09-2018 18:41,,1
87734,__export__.temp_log_155473_61ac6a04,09-09-2018 10:39,,0
92659,__export__.temp_log_92933_6985763b,03-09-2018 20:13,,0
93484,__export__.temp_log_109846_50bf370c,03-09-2018 17:38,,1


In [56]:
# Impute missing temperature readings with the column mean. The fill is
# restricted to 'temp': a frame-wide fillna would also write the temperature
# mean into NaNs of any other column (ids, dates, labels).
# NOTE(review): the feature-engineering cells that follow keep working on
# `df`, so the imputed values live only in `df3` — confirm which frame is
# meant to feed the model.
df3 = df.assign(temp=df['temp'].fillna(df['temp'].mean()))

In [61]:
# Preview the imputed frame `df3`.
df3.head()

Unnamed: 0,id,noted_date,temp,out/in
0,__export__.temp_log_196134_bd201015,08-12-2018 09:30,29.0,0
1,__export__.temp_log_196131_7bca51bc,08-12-2018 09:30,29.0,0
2,__export__.temp_log_196127_522915e3,08-12-2018 09:29,41.0,1
3,__export__.temp_log_196128_be0919cf,08-12-2018 09:29,41.0,1
4,__export__.temp_log_196126_d30b72fb,08-12-2018 09:29,31.0,0


In [60]:
# Confirm that no column of `df3` contains missing values any more.
df3.isnull().sum()

id            0
noted_date    0
temp          0
out/in        0
dtype: int64

### Parse the date string and derive date/time features

In [63]:
# Parse the 'dd-mm-YYYY HH:MM' strings into timestamps, then derive calendar
# features with the vectorised `.dt` accessor instead of a Python lambda per
# row. Each feature is cast to int64 so the column dtypes do not depend on the
# installed pandas version.
df['date'] = pd.to_datetime(df['noted_date'], format='%d-%m-%Y %H:%M')
noted_dt = df['date'].dt
df['year'] = noted_dt.year.astype('int64')
df['month'] = noted_dt.month.astype('int64')
df['day'] = noted_dt.day.astype('int64')
# ISO-8601 week number — the same value Timestamp.weekofyear reports.
df['weekofyear'] = noted_dt.isocalendar().week.astype('int64')
df['hour'] = noted_dt.hour.astype('int64')
df['minute'] = noted_dt.minute.astype('int64')
df.head()

Unnamed: 0,id,noted_date,temp,out/in,date,year,month,day,weekofyear,hour,minute
0,__export__.temp_log_196134_bd201015,08-12-2018 09:30,29.0,0,2018-12-08 09:30:00,2018,12,8,49,9,30
1,__export__.temp_log_196131_7bca51bc,08-12-2018 09:30,29.0,0,2018-12-08 09:30:00,2018,12,8,49,9,30
2,__export__.temp_log_196127_522915e3,08-12-2018 09:29,41.0,1,2018-12-08 09:29:00,2018,12,8,49,9,29
3,__export__.temp_log_196128_be0919cf,08-12-2018 09:29,41.0,1,2018-12-08 09:29:00,2018,12,8,49,9,29
4,__export__.temp_log_196126_d30b72fb,08-12-2018 09:29,31.0,0,2018-12-08 09:29:00,2018,12,8,49,9,29


In [64]:
def change(x):
    """Map an hour value to a time-of-day bucket code.

    Buckets: hours 22-23 and 0-3 -> 0, 4-11 -> 1, 12-16 -> 2, 17-21 -> 3.
    Any value that belongs to none of these buckets yields the string 'X'.
    """
    buckets = (
        ((22, 23, 0, 1, 2, 3), 0),
        (range(4, 12), 1),
        (range(12, 17), 2),
        (range(17, 22), 3),
    )
    # Membership tests (not hashing) are used so any value type is accepted.
    for hours, code in buckets:
        if x in hours:
            return code
    return 'X'

In [65]:
# Bucket every reading's hour into its time-of-day code, then preview the result.
df['timing'] = df['hour'].map(change)
df.head()

Unnamed: 0,id,noted_date,temp,out/in,date,year,month,day,weekofyear,hour,minute,timing
0,__export__.temp_log_196134_bd201015,08-12-2018 09:30,29.0,0,2018-12-08 09:30:00,2018,12,8,49,9,30,1
1,__export__.temp_log_196131_7bca51bc,08-12-2018 09:30,29.0,0,2018-12-08 09:30:00,2018,12,8,49,9,30,1
2,__export__.temp_log_196127_522915e3,08-12-2018 09:29,41.0,1,2018-12-08 09:29:00,2018,12,8,49,9,29,1
3,__export__.temp_log_196128_be0919cf,08-12-2018 09:29,41.0,1,2018-12-08 09:29:00,2018,12,8,49,9,29,1
4,__export__.temp_log_196126_d30b72fb,08-12-2018 09:29,31.0,0,2018-12-08 09:29:00,2018,12,8,49,9,29,1
