In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score

import numpy as np
import os
from sklearn import linear_model

import matplotlib.pyplot as plt

import seaborn as sns

from tqdm import tqdm
from tqdm import tqdm_notebook

from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the raw training signal; explicit dtypes keep the frame compact.
train_dtypes = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
train = pd.read_csv('train.csv', dtype=train_dtypes)

In [None]:
# Quick sanity check: frame dimensions, then the number of missing values per column.
print('Shape is ', train.shape)
train.isna().sum()

In [None]:
# Column labels of the training frame (DataFrame.keys() is the column index).
train.columns

In [None]:
# Show floats with 15 decimal places when frames are displayed.
pd.set_option('display.precision', 15)

In [None]:
# Build the training set by cutting the signal into consecutive, non-overlapping
# segments of `rows` samples and summarising every segment with a few statistics.
# Trailing rows that do not fill a whole segment are ignored (floor division).
rows = 150_000
segments = int(np.floor(train.shape[0] / rows))

# Possible improvement: replace the four basic features ["ave", "std", "min", "max"]
# with 103 features ["ave", "std", "x0", "x1", ..., "x100"], where "x5" denotes the
# maximum value among all values covering 5% of the population.
feature_columns = ['ave', 'std', 'max', 'min', '10th_per', '20th_per', '30th_per', '40th_per', '50th_per',
                   '60th_per', '70th_per', '80th_per', '90th_per', '100th_per']

X_train = pd.DataFrame(index=range(segments), columns=feature_columns, dtype=np.float64)
y_train = pd.DataFrame(index=range(segments), columns=['time_to_failure'], dtype=np.float64)

# Segment k covers rows [k * rows, k * rows + rows): the samples are grouped so that
# each statistic (mean, standard deviation, extrema, percentiles) is computed per group.
for segment in tqdm(range(segments)):
    start = segment * rows
    seg = train.iloc[start:start + rows]
    x = seg['acoustic_data'].values

    # Target for the segment: time_to_failure at its last sample.
    y_train.loc[segment, 'time_to_failure'] = seg['time_to_failure'].values[-1]

    X_train.loc[segment, 'ave'] = x.mean()
    X_train.loc[segment, 'std'] = x.std()
    X_train.loc[segment, 'max'] = x.max()
    X_train.loc[segment, 'min'] = x.min()

    # Deciles 10, 20, ..., 100 go into the '<p>th_per' columns.
    for pct in range(10, 101, 10):
        X_train.loc[segment, f'{pct}th_per'] = np.percentile(x, pct)


In [None]:
# Standardize the training features; `scaler` is kept so the same transform can be applied to test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
import sklearn.linear_model
from sklearn.linear_model import LogisticRegression

In [None]:
# Simple Linear Model

In [None]:
# Ordinary least-squares regression on the standardized features.
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)
model = lm  # fit() returns the estimator itself, so both names refer to the same object

In [None]:
# In-sample predictions of the linear model (same rows it was fitted on).
predictions = lm.predict(X=X_train_scaled)
print(predictions)

In [None]:
# Mean absolute error of the linear model on the training segments.
score = mean_absolute_error(y_true=y_train.values, y_pred=predictions)
print(f'Score: {score:0.3f}')

In [None]:
# Training-set R^2 of the linear model. The model was fitted on the standardized
# features, so it has to be scored on X_train_scaled; scoring on the raw X_train
# feeds it inputs on a different scale and yields a meaningless value.
lm.score(X_train_scaled, y_train)

In [None]:
# Fitted weights of the linear model (one per standardized input feature).
lm.coef_

In [None]:
# Fitted intercept (bias term) of the linear model.
lm.intercept_

In [None]:
# Submission template, indexed by segment id; the id doubles as the test file name.
sample_submission_path = 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path, index_col='seg_id')

In [None]:
# Empty (all-NaN) feature frame for the test segments: same columns as X_train, one row per seg_id.
X_test = pd.DataFrame(
    index=submission.index,
    columns=X_train.columns,
    dtype=np.float64,
)

In [None]:
# Number of entries in the test directory.
n_test_files = len(os.listdir('test'))
print("There are {} files in test folder".format(n_test_files))

In [None]:
# Fill the test feature matrix: one row per segment file under test/.
# Every column of X_test has to be populated. Percentile columns that stay NaN
# are passed through by the scaler and then make the model's predict() call fail
# on missing values, so they are computed here as well.
percentile_cols = [col for col in X_test.columns if col.endswith('th_per')]

for seg_id in X_test.index:
    seg = pd.read_csv('test/' + seg_id + '.csv')

    x = seg['acoustic_data'].values

    X_test.loc[seg_id, 'ave'] = x.mean()
    X_test.loc[seg_id, 'std'] = x.std()
    X_test.loc[seg_id, 'max'] = x.max()
    X_test.loc[seg_id, 'min'] = x.min()

    # Column names look like '10th_per'; the leading digits are the percentile rank.
    for col in percentile_cols:
        X_test.loc[seg_id, col] = np.percentile(x, int(col[:-len('th_per')]))

In [None]:
# Apply the training scaler to the test features, predict with the linear model, and write the submission file.
X_test_scaled = scaler.transform(X_test)
linear_test_predictions = lm.predict(X_test_scaled)
submission['time_to_failure'] = linear_test_predictions
submission.to_csv('submission.csv')

In [None]:
# Random Forest (the cells immediately below first evaluate the linear model's training-set predictions)

In [None]:
from sklearn import metrics

# Error metrics of `predictions` against the training targets; MSE is computed once and reused for RMSE.
mae = metrics.mean_absolute_error(y_train, predictions)
mse = metrics.mean_squared_error(y_train, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Predicted vs. actual time_to_failure. The x-axis holds y_train (training targets),
# so it is labelled as training data rather than test data.
plt.scatter(y_train, predictions)
plt.xlabel('Y Train')
plt.ylabel('Predicted Y')

In [None]:
# Re-read the submission template so it starts from the unmodified sample values.
submission = pd.read_csv(
    'sample_submission.csv',
    index_col='seg_id',
)

In [None]:
# Fresh all-NaN test feature frame: rows follow the submission ids, columns follow X_train.
test_ids = submission.index
X_test = pd.DataFrame(index=test_ids, columns=X_train.columns, dtype=np.float64)

In [None]:
# Fill the test feature matrix: one row per segment file under test/.
# All columns of X_test are populated, including the percentile columns. Leaving
# them as NaN makes the downstream predict() call fail on missing values.
percentile_cols = [col for col in X_test.columns if col.endswith('th_per')]

for seg_id in X_test.index:
    seg = pd.read_csv('test/' + seg_id + '.csv')

    x = seg['acoustic_data'].values

    X_test.loc[seg_id, 'ave'] = x.mean()
    X_test.loc[seg_id, 'std'] = x.std()
    X_test.loc[seg_id, 'max'] = x.max()
    X_test.loc[seg_id, 'min'] = x.min()

    # Column names look like '10th_per'; the leading digits are the percentile rank.
    for col in percentile_cols:
        X_test.loc[seg_id, col] = np.percentile(x, int(col[:-len('th_per')]))

In [None]:
# Scale the test features with the training scaler, predict with the linear model, and save.
X_test_scaled = scaler.transform(X=X_test)
submission['time_to_failure'] = lm.predict(X=X_test_scaled)
submission.to_csv(path_or_buf='submission.csv')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the training segments for validation, using only the four
# basic statistics as features.
X = X_train[['ave', 'std', 'max', 'min']]
y = y_train['time_to_failure']

# NOTE: this rebinds X_train / y_train (and X_test) to the split subsets, so the
# cell cannot simply be re-run without rebuilding the full feature frames first.
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=0)

In [None]:
from sklearn.model_selection import cross_val_predict, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Random forest of 600 trees, split quality measured by mean absolute error.
# NOTE(review): scikit-learn 1.0 renamed this criterion to 'absolute_error' and
# later releases no longer accept 'mae' — confirm against the installed version.
rfr = RandomForestRegressor(
    n_estimators=600,
    criterion='mae',
    random_state=0,
)

In [None]:
# 5-fold cross-validation of the forest on the four-feature frame, using the
# estimator's default scorer. The scores are stored but not displayed here.
cv_scores = cross_val_score(estimator=rfr, X=X, y=y, cv=5)

In [None]:
# Fit the forest on the (unscaled) training part of the split.
rfr.fit(X=X_train, y=y_train)

In [None]:
# Forest predictions for the held-out part of the split.
predictions = rfr.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Score of the forest on the held-out split (the regressor's default score).
holdout_score = rfr.score(X_test, y_test)
print(holdout_score)

In [None]:
# Predicted vs. actual time_to_failure on the held-out split, drawn on the current axes.
ax = plt.gca()
ax.scatter(y_test, predictions)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')

In [None]:
# Submission template for the random-forest predictions, indexed by segment id.
submission_template = 'sample_submission.csv'
submission = pd.read_csv(submission_template, index_col='seg_id')

In [None]:
# Empty test feature frame whose columns mirror the current X_train; this
# replaces the X_test produced by the train/validation split.
X_test = pd.DataFrame(dtype=np.float64,
                      index=submission.index,
                      columns=X_train.columns)

In [None]:
# Basic statistics for every test segment, written into the matching row of X_test.
stat_fns = {'ave': np.mean, 'std': np.std, 'max': np.max, 'min': np.min}

for seg_id in X_test.index:
    seg = pd.read_csv('test/' + seg_id + '.csv')
    x = seg['acoustic_data'].values

    for col, stat in stat_fns.items():
        X_test.loc[seg_id, col] = stat(x)


In [None]:
# The forest was fitted on the raw (unscaled) features, so it predicts from X_test directly.
# No scaler.transform() here: its output was never used for this model, and the
# scaler in scope at this point was fitted on the full 14-column feature frame,
# so applying it to this 4-column frame raises a feature-mismatch error.
submission['time_to_failure'] = rfr.predict(X_test)
submission.to_csv('submission.csv')

In [None]:
# Sample model: support vector regression (NuSVR)

In [None]:
# Refit the scaler on the current X_train so it matches the features the SVR will see.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Nu-support-vector regression with default hyperparameters on the standardized features.
svm = NuSVR()
svr_targets = y_train.values.flatten()  # 1-D target vector
svm.fit(X_train_scaled, svr_targets)

# In-sample predictions.
y_pred = svm.predict(X_train_scaled)
print(y_pred)

In [None]:
# Mean absolute error of the SVR on the rows it was fitted on.
score = mean_absolute_error(y_true=y_train.values.flatten(), y_pred=y_pred)
print(f'Score: {score:0.3f}')

In [None]:
# Training-set R^2 of the SVR. The model was fitted on the standardized features,
# so it has to be scored on X_train_scaled rather than on the raw X_train.
svm.score(X_train_scaled, y_train)

In [None]:
# Submission template for the SVR predictions, indexed by segment id.
submission = pd.read_csv(
    'sample_submission.csv',
    index_col='seg_id',
)

In [None]:
# Empty test feature frame: one row per submission id, columns taken from X_train.
feature_names = X_train.columns
X_test = pd.DataFrame(index=submission.index, columns=feature_names, dtype=np.float64)

In [None]:
# Number of entries in the test directory.
test_files = os.listdir('test')
print("There are {} files in test folder".format(len(test_files)))

In [None]:
# Compute the four basic statistics for each test segment and store them row by row.
for seg_id in X_test.index:
    seg = pd.read_csv('test/' + seg_id + '.csv')
    x = seg['acoustic_data'].values

    stats = (x.mean(), x.std(), x.max(), x.min())
    for col, value in zip(('ave', 'std', 'max', 'min'), stats):
        X_test.loc[seg_id, col] = value

In [None]:
# Scale the test features with the refitted scaler, predict with the SVR, and write the submission file.
X_test_scaled = scaler.transform(X_test)
svr_test_predictions = svm.predict(X_test_scaled)
submission['time_to_failure'] = svr_test_predictions
submission.to_csv('submission.csv')