In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [230]:
train = pd.read_csv('Train.csv')
train.head()

Unnamed: 0,signal_quality_dbm,network_delay_ms,latency_variation_ms,connect_type,mobile_device_model,service_provider,battery_charge(%),device_temperature (°C),network_switch_count,measured_download_Speed (Mbps)
0,-71.423256,18.57143,2.306581,5G NSA,Pixel 7,Jio,95.30171,41.734196,2.98202,198.116744
1,-103.149258,1.792135,1.838483,5G NSA,Pixel 7,Verizon,65.138797,42.697748,3.017341,992.344936
2,-61.282414,18.281477,0.168402,4G,Nord 4,AT&T,44.627951,30.01171,0.023513,241.67503
3,-87.742155,5.556565,2.665556,4G,iPhone 14,T-Mobile,39.018512,26.792422,1.021782,145.579935
4,-90.75666,15.647159,0.609766,5G SA,Galaxy S23,AT&T,71.65114,37.52673,4.042969,624.473904


In [231]:
train.shape

(15000, 10)

In [232]:
train.isnull().sum()

signal_quality_dbm                0
network_delay_ms                  0
latency_variation_ms              0
connect_type                      0
mobile_device_model               0
service_provider                  0
battery_charge(%)                 0
device_temperature (°C)           0
network_switch_count              0
measured_download_Speed (Mbps)    0
dtype: int64

In [233]:
train.duplicated().sum()

0

In [234]:
train.describe()

Unnamed: 0,signal_quality_dbm,network_delay_ms,latency_variation_ms,battery_charge(%),device_temperature (°C),network_switch_count,measured_download_Speed (Mbps)
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,-85.38806,11.023173,2.488534,54.67281,32.741387,1.998027,577.612761
std,14.916568,5.541652,1.443823,25.963792,7.141998,1.372455,260.557493
min,-110.645445,0.766634,0.02549,8.805655,19.682478,-0.093823,94.502385
25%,-98.845419,6.424588,1.199271,32.770321,26.6281,0.986948,355.768869
50%,-85.817138,10.975361,2.519074,53.10524,32.900237,2.000626,605.360857
75%,-72.194135,15.997373,3.745929,76.549537,38.976847,3.012709,798.837586
max,-59.538634,20.202308,5.06196,100.556976,45.399578,4.094574,1007.547456


In [235]:
test = pd.read_csv('Test.csv')
test.head()

Unnamed: 0,signal_quality_dbm,network_delay_ms,latency_variation_ms,connect_type,mobile_device_model,service_provider,battery_charge(%),device_temperature (°C),network_switch_count,measured_download_Speed (Mbps)
0,-80.123627,18.493582,0.537381,5G NSA,GT 7,T-Mobile,45.902728,34.87902,1.020813,
1,-96.316748,18.766166,4.715738,4G,Pixel 7,T-Mobile,36.633136,37.755433,0.980714,
2,-86.202083,17.223725,2.071679,5G SA,GT 7,AT&T,31.370516,25.553739,0.994984,
3,-89.649209,2.81236,3.666943,5G SA,Nord 4,Vi,16.795356,31.405906,1.986516,
4,-103.751362,4.272675,2.496559,5G SA,Pixel 7,T-Mobile,41.23123,38.141277,0.97889,


In [236]:
test.shape

(200, 10)

In [237]:
test.isnull().sum()

signal_quality_dbm                  0
network_delay_ms                    0
latency_variation_ms                0
connect_type                        0
mobile_device_model                 0
service_provider                    0
battery_charge(%)                   0
device_temperature (°C)             0
network_switch_count                0
measured_download_Speed (Mbps)    200
dtype: int64

In [238]:
test.duplicated().sum()

0

In [239]:
combined = pd.concat([train, test], axis = 0)
combined.shape

(15200, 10)

In [240]:
combined['signal_to_delay_ratio'] = combined['signal_quality_dbm'] / (combined['network_delay_ms'] + 1)
combined['latency_stability'] = 1 / (combined['latency_variation_ms'] + 1)
combined['battery_temp_ratio'] = combined['battery_charge(%)'] / (combined['device_temperature (°C)'] + 1)
combined['delay_times_switch'] = combined['network_delay_ms'] * combined['network_switch_count']
combined['temp_x_signal'] = combined['device_temperature (°C)'] * combined['signal_quality_dbm']

In [241]:
def categorize_signal(dbm):
    if dbm >= -70:
        return 'Strong'
    elif dbm >= -90:
        return 'Moderate'
    else:
        return 'Weak'

combined['signal_category'] = combined['signal_quality_dbm'].apply(categorize_signal)

In [242]:
combined['temp_level'] = pd.cut(combined['device_temperature (°C)'], bins=[-np.inf, 35, 45, np.inf], labels=['Cool', 'Normal', 'Hot'])

In [243]:
combined['device_model_avg_speed'] = combined.groupby('mobile_device_model')['measured_download_Speed (Mbps)'].transform('mean')
combined['provider_avg_delay'] = combined.groupby('service_provider')['network_delay_ms'].transform('mean')

In [244]:
combined['normalized_signal'] = (combined['signal_quality_dbm'] + 120) / 60

In [245]:
combined = pd.get_dummies(combined, columns=['connect_type', 'mobile_device_model', 'service_provider', 'signal_category', 'temp_level'],
    drop_first=True)

In [246]:
categorical_variables = combined.select_dtypes(include= 'object')
le = LabelEncoder()
for i in categorical_variables:
    combined[i] = le.fit_transform(combined[i])

In [247]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15200 entries, 0 to 199
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   signal_quality_dbm              15200 non-null  float64
 1   network_delay_ms                15200 non-null  float64
 2   latency_variation_ms            15200 non-null  float64
 3   battery_charge(%)               15200 non-null  float64
 4   device_temperature (°C)         15200 non-null  float64
 5   network_switch_count            15200 non-null  float64
 6   measured_download_Speed (Mbps)  15000 non-null  float64
 7   signal_to_delay_ratio           15200 non-null  float64
 8   latency_stability               15200 non-null  float64
 9   battery_temp_ratio              15200 non-null  float64
 10  delay_times_switch              15200 non-null  float64
 11  temp_x_signal                   15200 non-null  float64
 12  device_model_avg_speed          15200 n

In [248]:
combined.shape

(15200, 31)

In [249]:
newtrain = combined.iloc[0:15000, :]
newtest = combined.iloc[15000:15200 , :]
##newtest = combined.drop('measured_download_Speed (Mbps)', axis = 1)

In [250]:
newtrain.shape

(15000, 31)

In [251]:
newtest.shape

(200, 31)

In [277]:
newtest = newtest.drop('measured_download_Speed (Mbps)', axis = 1)
newtest.shape

(200, 30)

In [279]:
x = newtrain.drop('measured_download_Speed (Mbps)', axis = 1)
y = newtrain['measured_download_Speed (Mbps)']

In [281]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [283]:
models = {'Linear Regression': LinearRegression(), 'Random Forest': RandomForestRegressor(),
         'Bagging': BaggingRegressor(), 'Extra Tree': ExtraTreesRegressor(), 'LightGBM': LGBMRegressor(),
         'Gradient Boosting': GradientBoostingRegressor(), 'Adaboost': AdaBoostRegressor(),
         'XGB': XGBRegressor()}

In [285]:
def evaluate_models(x_train, x_test, y_train, y_test, models):
    results = {}
    for name, model in models.items():
        predictions = model.fit(x_train, y_train).predict(x_test)
        accuracy = r2_score(y_test, predictions)
        results[name] = accuracy
    return results

In [287]:
results = evaluate_models(x_train, x_test, y_train, y_test, models)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3106
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 30
[LightGBM] [Info] Start training from score 576.775462


In [288]:
best_model_name = max(results, key = results.get)
best_model = models[best_model_name]

In [289]:
print(f"best model is {best_model_name} with r2_score {results[best_model_name]}")

best model is Extra Tree with r2_score 0.9993521177385281


In [290]:
y_pred = best_model.fit(x_train, y_train).predict(x_test)

In [291]:
print(np.sqrt(r2_score(y_test, y_pred)))

0.9996734585933421


In [297]:
x_train = newtrain.drop('measured_download_Speed (Mbps)', axis = 1)
y_train = newtrain['measured_download_Speed (Mbps)']
x_test = newtest
y_pred = best_model.fit(x_train, y_train).predict(x_test)

In [298]:
solution = pd.DataFrame({'measured_download_Speed (Mbps)': y_pred})
solution.head()

Unnamed: 0,measured_download_Speed (Mbps)
0,881.565787
1,827.5333
2,845.676008
3,624.193686
4,759.079842


In [299]:
solution.to_csv('Solution.csv', index = False)