In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

## Data preperation

In [20]:
df = pd.read_csv('data.csv', header=None)
df = df.T
df.fillna(0, inplace=True)

In [3]:
labels_names = {
    'nominal': 1,
    'ordinal': 2,
    'numerical': 3,
    'textual': 4,
    'date': 5, 
    'unknown': 0
}

##### Tried finding common words in the title, tried including the ratio of special characters as a feature, but these did not affect the performance at all. 

##### Reducing the data classes from 5 to 4 (by combining 'date' and 'textual') does not affect the accuarcy of the 'ordinal', 'nominal', and 'numerical' classes. I combined them as I say there is no added benefit in keeping them separate.

In [72]:
def infer_data_type_from_title(title):
    # Convert title to lower case for case-insensitive matching
    title_lower = title.lower()

    # Keywords associated with different data types
    categorical_keywords = ['type', 'category', 'class', 'group', 'status', 'id', 'code', 'tag', 'gender', 'color', 'brand',
                            'name', 'city', 'country', 'level', 'grade', 'rank', 'stage', 'rating', 'score', 'priority', 
                            'severity', 'size', 'quality']
    
    numerical_keywords = ['number', 'amount', 'rate', 'price', 'cost', 'value', 'volume', 'length', 'weight', 'height', 
                          'width', 'depth', 'temperature', 'salary', 'count', 'percentage', 'score', 'distance', 'age',
                          'hour', 'minute', 'second', 'period', 'duration']
    
    textual_keywords = ['description', 'notes', 'comments', 'text', 'address', 'summary', 'content', 'remarks', 
                        'message', 'date', 'time', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timestamp', 
                        'period', 'duration', 'deadline']

    # Initialize one-hot encoded list
    one_hot = [0, 0, 0]

    # Check for presence of keywords in the title and set the corresponding index to 1
    if any(keyword in title_lower for keyword in categorical_keywords):
        one_hot[0] = 1
    if any(keyword in title_lower for keyword in numerical_keywords):
        one_hot[1] = 1
    if any(keyword in title_lower for keyword in textual_keywords):
        one_hot[2] = 1

    return one_hot

def count_delimiters(data_cells):
    delimiters = ['/', '-', ':']
    delimiters_ratio = []
    
    for delimiter in delimiters:
        delimiters_ratio.append(sum(cell.count(delimiter) for cell in data_cells.astype(str)) / len(data_cells))

    return delimiters_ratio

# # Initialize counts for special characters
# special_chars = {'%': 0, '-': 0, '#': 0, '/': 0, '$': 0, '£': 0, '€': 0}

# # Count occurrences of each special character
# for cell in data_cells:
#     for char in special_chars:
#         if char in str(cell):
#             special_chars[char] += 1

# # Calculate the ratio of counts to the total number of cells
# total_cells = len(data_cells)
# special_char_ratios = [count / total_cells for count in special_chars.values()]

In [73]:
def text_to_number_ratio(cell):
    """
    Calculate the ratio of text characters to number characters in a cell.
    """
    text_chars = sum(c.isalpha() for c in str(cell))
    number_chars = sum(c.isdigit() for c in str(cell))
    total_chars = text_chars + number_chars
    return text_chars / number_chars if number_chars > 0 else 0

def analyze_column(column):
    """
    Analyzes a single column and returns various statistics based on the content, with improvements.features

    :param column: Pandas Series representing a column from a DataFrame.
    :return: Dictionary containing various statistics about the column.
    """

    title = column.iloc[0]
    # label = column.iloc[-1]
    data_cells = column.iloc[1:-1]

    
    # Apply the ratio calculation to each cell and then find the average
    avg_text_to_number_ratio = data_cells.apply(text_to_number_ratio).mean()

    # Ratio of number of unique values to total number of values
    unique_to_total_ratio = data_cells.nunique() / len(data_cells)

    # Convert data_cells to numeric, coerce errors, and then drop NaNs for numeric calculations
    numeric_column = pd.to_numeric(data_cells, errors='coerce').dropna()

    # Calculate the percentage of non-NaN values
    percentage_non_nan = len(numeric_column) / len(data_cells) * 100

    # Min, Max, Mean, Median, Std for numeric data
    if percentage_non_nan >= 80:
        min_val = numeric_column.min()
        max_val = numeric_column.max()
        mean_val = numeric_column.mean()
        median_val = numeric_column.median()
        std_val = numeric_column.std()
        min_str_length = max_str_length = mean_str_length = median_str_length = std_str_length = 0
        avg_dashes = 0
        avg_slashes = 0
        
    else:  # Non-numeric data
        min_val = max_val = mean_val = median_val = std_val = 0
        str_lengths = data_cells.apply(lambda x: len(str(x)))
        min_str_length = str_lengths.min()
        max_str_length = str_lengths.max()
        mean_str_length = str_lengths.mean()
        median_str_length = str_lengths.median()
        std_str_length = str_lengths.std()
    
    return [
        avg_text_to_number_ratio, unique_to_total_ratio,
        min_val, max_val, mean_val, median_val, std_val,
        min_str_length, max_str_length, mean_str_length, median_str_length, std_str_length
    ]

In [74]:
# Apply the feature engineering function to each row
feature_matrix = df.apply(analyze_column, axis=1)

# Convert the resulting series of lists/arrays into a matrix
feature_matrix = np.array(feature_matrix.tolist())
labels = df.iloc[:, -1]
labels = labels.str.replace('date', 'textual', case=False, regex=True)
# Split the data into training-testing
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

In [75]:
feature_matrix[0].shape

(12,)

## Random forests

In [76]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

 Categorical       0.75      0.82      0.78        11
   Numerical       0.92      1.00      0.96        11
     Textual       0.86      0.67      0.75         9

    accuracy                           0.84        31
   macro avg       0.84      0.83      0.83        31
weighted avg       0.84      0.84      0.83        31

Accuracy: 0.8387096774193549


In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 350],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required to be at a leaf node
    'max_features': ['log2', 'sqrt'], # Number of features to consider when looking for the best split
    'bootstrap': [True, False]        # Whether bootstrap samples are used when building trees
}


# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Best parameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
              precision    recall  f1-score   support

  Categorial       0.89      0.73      0.80        11
   Numerical       0.79      1.00      0.88        11
     Textual       0.88      0.78      0.82         9

    accuracy                           0.84        31
   macro avg       0.85      0.84      0.83        31
weighted avg       0.85      0.84      0.84        31

Accuracy: 0.8387096774193549


#### To save the model as ONNX model

In [24]:
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as ort
import joblib

# 'model' is the trained scikit-learn model and 'X_test', 'y_test' are the test sets
# Convert to ONNX
initial_type = [('float_input', FloatTensorType([None, X_test.shape[1]]))]
onnx_model = convert_sklearn(rf_classifier, initial_types=initial_type)

# Save the ONNX model to a file
onnx_model_path = 'model.onnx'
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

In [25]:
# Test the performance of the ONNX model
# Load the ONNX model with ONNX Runtime
sess = ort.InferenceSession(onnx_model_path)

# Prepare the input data
input_data = {sess.get_inputs()[0].name: X_test.astype(np.float32)}

# Run inference
onnx_output = sess.run(None, input_data)

onnx_predictions = onnx_output[0]

# Evaluate accuracy
onnx_accuracy = accuracy_score(y_test, onnx_predictions)
print("ONNX Model Accuracy:", onnx_accuracy)

ONNX Model Accuracy: 0.8387096774193549


## Gradient Boosting Machines

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [280, 300, 320, 340, 360, 380],
    'learning_rate': [0.01],
    'max_depth': [4],
}

gbm = GradientBoostingClassifier()

grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# gbm.fit(X_train, y_train)
# y_pred = gbm.predict(X_test)

# print(classification_report(y_test, y_pred))
# print("Accuracy:", accuracy_score(y_test, y_pred))

Fitting 10 folds for each of 6 candidates, totalling 60 fits


In [14]:
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300}
              precision    recall  f1-score   support

  Categorial       0.77      0.91      0.83        11
   Numerical       0.85      1.00      0.92        11
     Textual       0.80      0.44      0.57         9

    accuracy                           0.81        31
   macro avg       0.81      0.78      0.77        31
weighted avg       0.81      0.81      0.79        31

Accuracy: 0.8064516129032258


## SVM

In [15]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = SVC(kernel='rbf')  # 'rbf' kernel is usually a good choice; can also try 'linear', 'poly'
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

  Categorial       0.82      0.82      0.82        11
   Numerical       0.73      1.00      0.85        11
     Textual       0.80      0.44      0.57         9

    accuracy                           0.77        31
   macro avg       0.78      0.75      0.75        31
weighted avg       0.78      0.77      0.76        31

Accuracy: 0.7741935483870968


In [16]:
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_test.shape:", X_test.shape)
print("y_test.shape:", y_test.shape)

X_train.shape: (121, 12)
y_train.shape: (121,)
X_test.shape: (31, 12)
y_test.shape: (31,)


## LSTM

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

max_sequence_length = 12

# Pad sequences
X_train_padded = pad_sequences(X_train, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test, maxlen=max_sequence_length)

# Reshape for LSTM input
X_train_padded = X_train_padded.reshape((X_train_padded.shape[0], max_sequence_length, 1))
X_test_padded = X_test_padded.reshape((X_test_padded.shape[0], max_sequence_length, 1))

# Initialize the encoder
encoder = LabelEncoder()
# Fit the encoder and transform the labels to numeric
y_train_numeric = encoder.fit_transform(y_train)
y_test_numeric = encoder.transform(y_test)

# Define model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(max_sequence_length, 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train_padded, y_train_numeric, epochs=4, batch_size=32)

# Evaluate model
loss, accuracy = model.evaluate(X_test_padded, y_test_numeric)
print("Accuracy:", accuracy)




Epoch 1/4


Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.35483869910240173


## KNN

In [19]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

X = feature_matrix  # Features
y = labels  # Labels

# Encode the labels if they are categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)

# Train the classifier
knn.fit(X_train, y_train)

# Evaluate the classifier
accuracy = knn.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8064516129032258
