In [None]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
import matplotlib.pyplot as plt

In [None]:
# Directory on the mounted Google Drive that holds the four CSV files.
# Change this single constant to run the notebook against another location.
DATA_DIR = '/content/drive/MyDrive/ML_projects'

# x-* files hold the time series, y-* files hold the labels (train / test splits).
x_test = pd.read_csv(f'{DATA_DIR}/x-test.csv')
x_train = pd.read_csv(f'{DATA_DIR}/x-train.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y-test.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y-train.csv')

In [None]:
# Preview the first rows of the test inputs as a quick check on the loaded columns.
x_test.head()

In [None]:
# Preview the first rows of the training inputs as a quick check on the loaded columns.
x_train.head()

In [None]:
# Plot the first few time series in x_train as a visual sanity check.
# The ids are taken from the data (in order of first appearance) instead of
# assuming that the literal ids 0 and 1 exist.
N_SERIES_TO_PLOT = 2
unique_ids_to_plot = x_train['id'].unique()

for series_id in unique_ids_to_plot[:N_SERIES_TO_PLOT]:
    sample_series = x_train[x_train['id'] == series_id].sort_values(by='time')

    # Deliberately very wide figure so that long series are not visually compressed.
    fig, ax = plt.subplots(figsize=(200, 5))
    ax.plot(sample_series['time'], sample_series['value'])
    ax.set_title(f'Time Series for ID: {series_id}')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    ax.grid(True)
    plt.show()
    plt.close(fig)  # release the figure so repeated plots do not accumulate in memory

In [None]:
# Preview the first rows of the test labels as a quick check on the loaded columns.
y_test.head()

In [None]:
# Preview the first rows of the training labels as a quick check on the loaded columns.
y_train.head()

In [None]:
# Use the x_train and y_train dataframes loaded previously
# Assuming x_train has columns 'id', 'time', 'value' and y_train has 'id', 'structural_breakpoint'

# Merge x_train and y_train to have labels associated with each time series
# We need to ensure that the IDs in x_train and y_train match.
# Let's assume y_train contains the ground truth for the IDs present in x_train.
# We will process each unique time series (identified by 'id') in x_train.

# Distinct time-series ids found in the 'id' column of x_train.
unique_ids = x_train['id'].unique()
# NOTE(review): `data` is not referenced again in the visible cells, and `labels`
# is reassigned from `labels_list` further down in this cell — confirm whether
# these two placeholders are still needed.
data = []
labels = []

# Choosing the break point
# ------------------------
# extract_features() compares the segment before a break point with an equally
# long segment after it, so a break-point position is needed for every series.
# Options considered:
#   * a fixed break point (the original synthetic data used break_point=100);
#   * the midpoint of each series, which would cope with varying series lengths;
#   * a change point detection algorithm, which would be more sophisticated
#     and more robust, depending on the data.
# For demonstration this notebook keeps the simplified fixed break_point=100.
# A series therefore needs at least 200 points; shorter series yield NaN
# features and are filtered out before training. If the series lengths vary a
# lot, this approach may need adjustment.

def extract_features(series, break_point=100):
    """Compute before/after difference features around a fixed break point.

    The first ``break_point`` values form the "before" segment and the next
    ``break_point`` values form the "after" segment; values beyond
    ``2 * break_point`` are ignored. Every feature is
    ``statistic(after) - statistic(before)``.

    Parameters
    ----------
    series : array-like of float
        Values of a single time series, already in time order.
    break_point : int, default 100
        Length of each segment, i.e. the position of the assumed break.

    Returns
    -------
    list of float
        ``[d_mean, d_var, d_std, d_skew, d_kurt]``. All five entries are NaN
        when ``break_point`` is smaller than 1 or the series holds fewer than
        ``2 * break_point`` values. The skewness and kurtosis entries are NaN
        when each segment holds fewer than two values (``break_point == 1``).
    """
    values = np.asarray(series, dtype=float)

    # A non-positive break point would produce empty or nonsensical segments;
    # answer with NaNs directly instead of letting numpy warn on empty slices.
    if break_point < 1 or len(values) < 2 * break_point:
        return [np.nan] * 5

    # The length check above guarantees both slices hold exactly `break_point`
    # values, so no further length validation is needed.
    before = values[:break_point]
    after = values[break_point:2 * break_point]

    d_mean = np.mean(after) - np.mean(before)
    d_var = np.var(after) - np.var(before)
    d_std = np.std(after) - np.std(before)

    # Skewness and kurtosis are not meaningful for a single observation.
    if break_point < 2:
        return [d_mean, d_var, d_std, np.nan, np.nan]

    d_skew = skew(after) - skew(before)
    d_kurt = kurtosis(after) - kurtosis(before)

    return [d_mean, d_var, d_std, d_skew, d_kurt]

# Build the training feature table: one row of difference features per series id.
features_list = []
labels_list = []
processed_ids = []

# First label row per id, exposed as an id -> label lookup so y_train is not
# re-filtered for every time series.
label_by_id = y_train.drop_duplicates(subset='id').set_index('id')['structural_breakpoint']

# groupby(sort=False) visits the ids in order of first appearance and avoids
# scanning the whole of x_train once per id.
for series_id, series_rows in x_train.groupby('id', sort=False):
    time_series_data = series_rows.sort_values(by='time')['value'].values
    features = extract_features(time_series_data, break_point=100)

    # Skip series whose features could not be computed (e.g. series too short).
    if np.isnan(features).any():
        continue
    # Skip series without a ground-truth label. This is checked *before*
    # anything is appended so that features, labels and ids stay aligned.
    if series_id not in label_by_id.index:
        continue

    features_list.append(features)
    labels_list.append(label_by_id.loc[series_id])
    processed_ids.append(series_id)

feature_names = ['ΔMean', 'ΔVar', 'ΔStd', 'ΔSkew', 'ΔKurt']

# reshape keeps a well-formed (0, 5) matrix even when no series qualified.
feature_matrix = np.array(features_list, dtype=float).reshape(-1, len(feature_names))
labels = np.array(labels_list)

# Create a DataFrame for clarity
df = pd.DataFrame(feature_matrix, columns=feature_names)
df['Label'] = labels
df['id'] = processed_ids


print("Feature Sample:")
display(df.head())

# -----------------------------------------------
# STEP 3: Assemble the training inputs from the extracted feature table
# (every series from x_train/y_train that produced valid features)
# -----------------------------------------------

X_train = df.loc[:, ['ΔMean', 'ΔVar', 'ΔStd', 'ΔSkew', 'ΔKurt']]
y_train_labels = df.loc[:, 'Label']


# -----------------------------------------------
# STEP 4: Fit a logistic regression classifier on the full training feature table
# (fit() returns the estimator itself, so the fitted model is bound directly)
# -----------------------------------------------
model = LogisticRegression().fit(X_train, y_train_labels)

# -----------------------------------------------
# STEP 5: Evaluation against x_test/y_test happens in a later cell;
# here we only report that training has finished.
# -----------------------------------------------
print("\nLogistic Regression Model trained on x_train and y_train.")

In [None]:
# Build the test feature table from x_test (columns used: 'id', 'time', 'value').

unique_test_ids = x_test['id'].unique()
test_features_list = []
test_processed_ids = []

# groupby(sort=False) visits the ids in order of first appearance and avoids
# scanning the whole of x_test once per id.
for series_id, series_rows in x_test.groupby('id', sort=False):
    time_series_data = series_rows.sort_values(by='time')['value'].values
    features = extract_features(time_series_data, break_point=100)

    # Keep only series whose features could be computed (e.g. long enough).
    if not np.isnan(features).any():
        test_features_list.append(features)
        test_processed_ids.append(series_id)

test_feature_names = ['ΔMean', 'ΔVar', 'ΔStd', 'ΔSkew', 'ΔKurt']

# reshape keeps a well-formed (0, 5) matrix even when no series qualified.
test_feature_matrix = np.array(test_features_list, dtype=float).reshape(-1, len(test_feature_names))

# Create a DataFrame for clarity
df_test = pd.DataFrame(test_feature_matrix, columns=test_feature_names)
df_test['id'] = test_processed_ids

print("Test Feature Sample:")
display(df_test.head())

# Score the extracted test features with `model`, the classifier fitted in the
# training cell (that cell must have been run first).
X_test = df_test.loc[:, ['ΔMean', 'ΔVar', 'ΔStd', 'ΔSkew', 'ΔKurt']]
y_pred_test = model.predict(X_test)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (presumably the "break" class — confirm against the labels).
# These scores feed the ROC AUC computation.
y_prob_test = model.predict_proba(X_test)[:, 1]

# Hard predictions as 1/0 integers instead of booleans.
y_pred_test_int = y_pred_test.astype(int)

# One frame with the hard predictions per id ...
predictions_df = pd.DataFrame(
    {
        'id': test_processed_ids,
        'predicted_structural_break': y_pred_test_int,
    }
)

# ... and one with the predicted probabilities per id (used for ROC AUC later).
prob_df = pd.DataFrame(
    {
        'id': test_processed_ids,
        'predicted_prob_structural_break': y_prob_test,
    }
)


print("\nTest Predictions (1/0 format):")
display(predictions_df.head())

# Evaluation against y_test lives in its own cell so that prediction and
# evaluation stay clearly separated.

In [None]:
# Evaluate the model's predictions against the y_test dataset.
# Requires predictions_df (1/0 predictions), prob_df (prediction probabilities)
# and y_test (true labels) from the previous cells.

# Attach the true labels to the predictions. Ids that have no label in y_test
# come back as NaN from the left merge and are dropped. The chain builds a new
# frame instead of mutating in place, so re-running the cell is idempotent.
merged_test_results = (
    pd.merge(predictions_df, y_test, on='id', how='left')
    .dropna(subset=['structural_breakpoint'])
    # Ensure the true labels are boolean for the evaluation metrics.
    .assign(structural_breakpoint=lambda frame: frame['structural_breakpoint'].astype(bool))
)

y_true_test = merged_test_results['structural_breakpoint']
y_hat_test = merged_test_results['predicted_structural_break']

print("\nEvaluation on Test Data:")
print(classification_report(y_true_test, y_hat_test, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_true_test, y_hat_test))
print("Accuracy:", accuracy_score(y_true_test, y_hat_test))

# Add the predicted probabilities, which the ROC metrics need.
merged_test_results_with_prob = pd.merge(merged_test_results, prob_df, on='id', how='left')
y_true_roc = merged_test_results_with_prob['structural_breakpoint']
y_score_roc = merged_test_results_with_prob['predicted_prob_structural_break']

if y_true_roc.nunique() < 2:
    # roc_auc_score raises when only one class is present in the true labels;
    # report that explicitly instead of failing the whole cell.
    print("ROC AUC Score: undefined (only one class present in the test labels)")
else:
    # Compute the AUC once and reuse it for both the printout and the legend.
    test_auc = roc_auc_score(y_true_roc, y_score_roc)
    print("ROC AUC Score:", test_auc)

    # Optional: Plot ROC Curve for test data
    fpr_test, tpr_test, thresholds_test = roc_curve(y_true_roc, y_score_roc)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr_test, tpr_test, color='blue', label='ROC curve (AUC = %0.2f)' % test_auc)
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve for Structural Break Detection (Test Data)')
    ax.legend()
    plt.show()