In [None]:
# About Dataset
# Context
# Different cricket species produce different types of sounds. For instance, in Gryllus bimaculatus (field cricket), the 
# chirping sounds can reach up to 100 decibels, while Gryllotalpa vineae (mole cricket) chirps at about 88 decibels. For 
# comparison, a car’s horn reaches about 110 decibels, and referee whistles can be approximately 115–125 decibels.

# Male crickets produce sounds by rubbing their leathery front wings together, i.e., file-like serrations on the wings’ edges 
# rub against a sharp edge (scraper). This is called “stridulation” and is used to attract female crickets as mates. When this 
# sound is being produced, the cricket’s wings are elevated. Each time the wings rub together, this is called a “pulse” and the
# pulse rate is impacted by factors such as temperature, e.g., faster rate during warmer temperatures. Pulse rate and the pattern
# of the pulses also differ between cricket species.


# The major features are Time, Chirps_15s (Chirps count in 15 seconds) and Temp_C (Temperature(C°) at the time of chirps)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Cricket_chirps.csv')
df

Unnamed: 0,X,Y
0,88.599998,19
1,71.599998,16
2,93.300003,22
3,84.300003,17
4,80.599998,19
...,...,...
1494,81.000000,17
1495,81.000000,17
1496,86.000000,17
1497,83.000000,20


In [3]:
# x : number of chirps in 15 seconds and y: Temp ( degree celcius)

In [4]:
df.shape

(1499, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       1499 non-null   float64
 1   Y       1499 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 23.6 KB


In [6]:
# no null vallues as we can see from the above

In [14]:
x = df['X'].values.reshape(-1,1)
y = df['Y']

In [16]:
# Lets create the linear regression model:
# step 1 : splitting the data into train and test:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=42)

In [17]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)

In [20]:
y_pred = lr.predict(x_test)

In [21]:
y_pred

array([18.48310996, 17.07575518, 18.17036445, 18.63948271, 18.17036445,
       17.38850068, 19.10860097, 18.3267372 , 17.70124619, 16.13751866,
       17.85761894, 18.0139917 , 18.95222822, 15.35565489, 16.29389141,
       17.07575518, 18.17036445, 18.79585546, 17.23212793, 15.9811459 ,
       15.51202764, 19.26497372, 17.07575518, 15.6684004 , 17.85761894,
       16.91938242, 18.95222822, 18.3267372 , 18.17036445, 15.04290938,
       17.70124619, 19.10860097, 16.13751866, 17.70124619, 18.17036445,
       17.54487344, 19.10860097, 16.91938242, 15.51202764, 18.48310996,
       16.76300967, 15.9811459 , 16.45026416, 19.10860097, 18.17036445,
       16.29389141, 18.3267372 , 17.54487344, 16.91938242, 18.17036445,
       17.38850068, 18.0139917 , 16.45026416, 18.63948271, 19.42134648,
       16.60663692, 15.6684004 , 18.0139917 , 15.82477315, 19.26497372,
       17.85761894, 15.82477315, 17.23212793, 17.70124619, 15.04290938,
       19.26497372, 15.35565489, 18.17036445, 18.0139917 , 15.82

In [22]:
y_test

1116    20
1395    17
422     17
413     21
451     18
        ..
983     19
799     17
1353    17
1149    18
824     15
Name: Y, Length: 300, dtype: int64

In [25]:
df_pred = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
df_pred.head(20)

Unnamed: 0,y_test,y_pred
1116,20,18.48311
1395,17,17.075755
422,17,18.170364
413,21,18.639483
451,18,18.170364
861,16,17.388501
1063,19,19.108601
741,19,18.326737
1271,18,17.701246
259,17,16.137519


In [27]:
from sklearn.metrics import r2_score
r2score = r2_score(y_test,y_pred)
r2score

0.5309933656482937

In [36]:
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(max_depth=4)
dtr.fit(x_train,y_train)

In [37]:
dtr_y_pred = dtr.predict(x_test)

In [38]:
r2sc = r2_score(y_test,dtr_y_pred)
r2sc

0.5719420118176819

In [39]:
# hyperparamter tuning:
# checking the best parameter values:
c_parm = 'max_depth'
c_parm_val = list(range(1,21))
dframe = pd.DataFrame(columns=[c_parm,'accuracy'])

for input_parameter in c_parm_val:
    dt = DecisionTreeRegressor(max_depth=input_parameter,splitter='best')
    dt.fit(x_train,y_train) 
    y_pred = dt.predict(x_test) 
    r2score = r2_score(y_test,y_pred)*100
    dframe = dframe.append({c_parm:input_parameter, 'accuracy':  r2score}, ignore_index=True) 
print(dframe)
print('')

    max_depth   accuracy
0         1.0  37.592456
1         2.0  53.902065
2         3.0  56.904240
3         4.0  57.194201
4         5.0  56.925442
5         6.0  56.889813
6         7.0  56.934158
7         8.0  56.821185
8         9.0  56.876148
9        10.0  56.876148
10       11.0  56.876148
11       12.0  56.876148
12       13.0  56.876148
13       14.0  56.876148
14       15.0  56.876148
15       16.0  56.876148
16       17.0  56.876148
17       18.0  56.876148
18       19.0  56.876148
19       20.0  56.876148



In [40]:
# applied above based on the results.

In [41]:
# hyperparamter tuning:
param_grid = {
    'max_depth': [None, 5, 10, 15],  
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [44]:
# hyperparamter tuning:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='neg_mean_squared_error')

In [46]:
grid_search.fit(x_train, y_train)

In [47]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [48]:
best_model = grid_search.best_estimator_

In [50]:
y_pred_dt = best_model.predict(x_test)

In [52]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.4435842935525218


In [54]:
r2score_ = r2_score(y_test, y_pred_dt)
r2score_*100

56.700986666046695