In [16]:
import os
import random
import numpy as np
import tensorflow as tf

# Seed every random number source used in this notebook so repeated runs are comparable.
SEED = 42
# NOTE(review): this is assigned from inside the already-running kernel, so it
# presumably only reaches child processes and not this interpreter's own string
# hashing -- confirm, and export the variable before launching Jupyter if needed.
os.environ['PYTHONHASHSEED'] = str(SEED)  # Python hash seed
random.seed(SEED)                         # Python random module seed
np.random.seed(SEED)                      # NumPy seed
tf.random.set_seed(SEED)                  # TensorFlow global seed
# NOTE(review): set after TensorFlow has been imported above; whether TensorFlow
# still honours the flag at this point depends on the installed version -- confirm.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

import pandas as pd
import xgboost as xgb
from joblib import dump, load

# these are dummy models
class MLModel:
    """Minimal base interface for the models in this notebook.

    Every method is a no-op placeholder that returns ``None``; subclasses
    provide the real behaviour.
    """

    def __init__(self) -> None:
        """Create the model; the base class keeps no state."""
        return None

    def train(self, X, y):
        """Placeholder training hook: ignores ``X`` and ``y`` and returns ``None``."""
        return None

    def predict(self, X):
        """Placeholder inference hook: ignores ``X`` and returns ``None``."""
        return None
    
class TextSeqModel(MLModel):
    """Binary classifier for digit strings, served from a saved Keras model.

    The network is read from ``MODEL_PATH`` the first time a prediction is
    requested and is then reused, instead of being reloaded and recompiled on
    every call.
    """

    # Saved network, resolved relative to the current working directory.
    MODEL_PATH = 't3_cnn.keras'

    def __init__(self) -> None:
        # Filled in lazily by _get_model() so that construction does no I/O.
        self._model = None

    def _get_model(self):
        """Return the cached Keras model, loading it from disk on first use."""
        # getattr guards against subclasses that do not call this __init__.
        if getattr(self, '_model', None) is None:
            # compile=False: optimizer, loss and metrics are only needed for
            # training/evaluation, not for model.predict().
            self._model = tf.keras.models.load_model(self.MODEL_PATH, compile=False)
        return self._model

    def one_hot_encode_digits(self, strings):
        """One-hot encode an iterable of digit strings.

        Args:
            strings: iterable of strings whose characters are decimal digits.
                All strings are expected to have the same length so the
                per-string encodings stack into one array -- TODO confirm
                against the dataset.

        Returns:
            ``np.ndarray`` holding, for every string, one 10-way one-hot
            vector per character.

        Raises:
            ValueError: if a character cannot be parsed by ``int()``.
        """
        # Convert strings of digits into a list of digit sequences (lists of ints)
        digit_sequences = [[int(char) for char in string] for string in strings]
        # One-hot encode digits (0-9)
        return np.array(
            [tf.keras.utils.to_categorical(seq, num_classes=10) for seq in digit_sequences]
        )

    def predict_logits(self, X):
        """Return the raw network output for the digit strings in ``X``.

        NOTE(review): despite the name, ``predict`` thresholds these values at
        0.5, so they are presumably sigmoid probabilities rather than logits --
        confirm against the saved model.
        """
        X_encoded = self.one_hot_encode_digits(X)
        return self._get_model().predict(X_encoded)

    def predict(self, X):
        """Return a flat ``int32`` array of 0/1 labels for the digit strings in ``X``."""
        return (self.predict_logits(X) > 0.5).astype("int32").flatten()


class EmoticonModel(MLModel):
    """Classifier over emoticon strings using a persisted encoder and SVM.

    The fitted encoder (``encoder.pkl``) and classifier (``SVM.pkl``) are read
    with ``joblib.load`` on first use and then cached on the instance.

    NOTE: ``joblib.load`` unpickles arbitrary Python objects; only point these
    paths at artefacts you produced yourself or otherwise trust.
    """

    ENCODER_PATH = 'encoder.pkl'
    MODEL_PATH = 'SVM.pkl'

    # Emoticons that are collapsed onto REPLACEMENT_EMOTICON before encoding.
    # NOTE(review): presumably symbols the fitted encoder never saw -- confirm.
    REPLACED_EMOTICONS = frozenset(
        ['🛠', '🙵', '🙺', '\U0001f6da', '🛃', '🙝', '🙂', '🚬', '🙭', '😈', '🙡', '🚻']
    )
    REPLACEMENT_EMOTICON = '😣'

    def __init__(self) -> None:
        # Both artefacts are loaded lazily so that construction does no I/O.
        self._encoder = None
        self._model = None

    def _get_encoder(self):
        """Return the cached fitted encoder, loading it on first use."""
        # getattr guards against subclasses that do not call this __init__.
        if getattr(self, '_encoder', None) is None:
            self._encoder = load(self.ENCODER_PATH)
        return self._encoder

    def _get_model(self):
        """Return the cached fitted classifier, loading it on first use."""
        if getattr(self, '_model', None) is None:
            self._model = load(self.MODEL_PATH)
        return self._model

    def one_hot_encode_emoticons(self, df_test):
        """Split each emoticon string into per-position columns and encode them.

        Args:
            df_test: DataFrame with an ``'input_emoticon'`` column of strings.

        Returns:
            Whatever the persisted encoder's ``transform`` returns for the
            per-position emoticon table.

        Side effects:
            Adds (or overwrites) a ``'split_emojis'`` column on ``df_test``
            holding each string as a list of characters.  This is kept on
            purpose because other notebook cells read that column.
        """
        # Converting feature to lists
        df_test['split_emojis'] = df_test['input_emoticon'].apply(list)

        # One column per character position, one row per sample.
        X_test = pd.DataFrame(df_test['split_emojis'].tolist())

        replaced = self.REPLACED_EMOTICONS
        replacement = self.REPLACEMENT_EMOTICON

        def replace_emoticon(val):
            # Collapse the listed emoticons onto the replacement; keep the rest.
            return replacement if val in replaced else val

        # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
        # fall back to applymap only where map is not available.
        if hasattr(X_test, 'map'):
            X_test = X_test.map(replace_emoticon)
        else:
            X_test = X_test.applymap(replace_emoticon)

        return self._get_encoder().transform(X_test)

    def predict(self, X_test):
        """Return the classifier's predictions for the emoticon DataFrame ``X_test``.

        ``X_test`` is modified as described in ``one_hot_encode_emoticons``.
        """
        X_test_encoded = self.one_hot_encode_emoticons(X_test)
        return self._get_model().predict(X_test_encoded)
    
class FeatureModel(MLModel):
    """Binary classifier over feature arrays: flatten, scale, PCA, Keras model.

    The scaler, PCA and network are loaded on first use and cached on the
    instance instead of being reloaded (and the network recompiled) per call.

    NOTE: ``joblib.load`` unpickles arbitrary Python objects; only point the
    artefact paths at files you trust.
    """

    # Width of one flattened sample expected by the persisted scaler.
    N_FLAT_FEATURES = 13 * 768
    SCALER_PATH = 'std_scaler.bin'
    PCA_PATH = 'pca.bin'
    MODEL_PATH = "TASK1_BEST_MODEL.keras"

    def __init__(self) -> None:
        # (scaler, pca, model) tuple, filled in lazily by _get_artifacts().
        self._artifacts = None

    def _get_artifacts(self):
        """Return the cached ``(scaler, pca, model)`` tuple, loading it on first use."""
        # getattr guards against subclasses that do not call this __init__.
        if getattr(self, '_artifacts', None) is None:
            self._artifacts = (
                load(self.SCALER_PATH),
                load(self.PCA_PATH),
                # compile=False: optimizer/loss are not needed for predict().
                tf.keras.models.load_model(self.MODEL_PATH, compile=False),
            )
        return self._artifacts

    def predict_logits(self, X):
        """Return the raw network output for the samples in ``X``.

        Args:
            X: array-like whose first axis indexes samples and whose remaining
                axes hold exactly ``13 * 768`` values per sample.

        Raises:
            ValueError: if a sample does not contain ``13 * 768`` values.

        NOTE(review): despite the name, ``predict`` thresholds these values at
        0.5, so they are presumably sigmoid probabilities -- confirm.
        """
        X = np.asarray(X)
        # reshape() leaves the caller's array untouched and fails loudly on a
        # size mismatch, whereas the in-place ndarray.resize() used previously
        # changed the caller's array shape and would truncate or zero-pad.
        X_flat = X.reshape(X.shape[0], self.N_FLAT_FEATURES)
        scaler, pca, model = self._get_artifacts()
        X_pca = pca.transform(scaler.transform(X_flat))
        return model.predict(X_pca)

    def predict(self, X):
        """Return a flat ``int32`` array of 0/1 labels for the samples in ``X``."""
        return (self.predict_logits(X) > 0.5).astype("int32").flatten()
    
class CombinedModel(MLModel):
    """Stacked classifier that combines the three per-dataset models.

    The emoticon XGBoost model, ``FeatureModel`` and ``TextSeqModel`` each
    contribute one score column; the columns are scaled and passed to a final
    XGBoost classifier.  All artefacts are loaded on first use and cached.

    NOTE: ``joblib.load`` unpickles arbitrary Python objects; only load a
    scaler file you trust.
    """

    EMOTICON_XGB_PATH = "XGB.json"
    FINAL_XGB_PATH = "XGB_FINAL.json"
    SCALER_PATH = 'std_scaler_combined_model.bin'

    def __init__(self) -> None:
        # Dict of loaded artefacts, filled in lazily by _get_parts().
        self._parts = None

    def _get_parts(self):
        """Return the cached base models, scaler and final classifier."""
        # getattr guards against subclasses that do not call this __init__.
        if getattr(self, '_parts', None) is None:
            emoticon_xgb = xgb.XGBClassifier()
            emoticon_xgb.load_model(self.EMOTICON_XGB_PATH)
            final_xgb = xgb.XGBClassifier()
            final_xgb.load_model(self.FINAL_XGB_PATH)
            self._parts = {
                'emoticon_encoder': EmoticonModel(),
                'emoticon_xgb': emoticon_xgb,
                'feature_model': FeatureModel(),
                'text_model': TextSeqModel(),
                'scaler': load(self.SCALER_PATH),
                'final_xgb': final_xgb,
            }
        return self._parts

    def predict(self, X1, X2, X3):
        """Predict labels from the three aligned views of the same samples.

        Args:
            X1: feature array passed to ``FeatureModel.predict_logits``.
            X2: emoticon DataFrame passed to
                ``EmoticonModel.one_hot_encode_emoticons``.
            X3: digit strings passed to ``TextSeqModel.predict_logits``.

        Returns:
            The final XGBoost classifier's predicted labels.
        """
        parts = self._get_parts()

        # Emoticon view: probability of the positive class as an (n, 1) column.
        X2_encoded = parts['emoticon_encoder'].one_hot_encode_emoticons(X2)
        probabilities = parts['emoticon_xgb'].predict_proba(X2_encoded)
        # reshape() works for any memory layout; the in-place resize() used
        # before only succeeds when the column view is a single memory segment.
        X_emoticons_probs = probabilities[:, 1].reshape(-1, 1)

        # assumes both outputs are 2-D with one row per sample -- TODO confirm
        X_array_probs = parts['feature_model'].predict_logits(X1)
        X_text_probs = parts['text_model'].predict_logits(X3)

        # Column order must match the order the combined scaler/model were fit on.
        X = np.concatenate((X_emoticons_probs, X_array_probs, X_text_probs), axis=1)
        X_scaled = parts['scaler'].transform(X)

        return parts['final_xgb'].predict(X_scaled)

def save_predictions_to_file(predictions, filename):
    """Write each prediction on its own line to ``filename``.

    Args:
        predictions: iterable of values; each is formatted with ``str.format``
            semantics and followed by a newline.
        filename: path of the text file to create or overwrite.
    """
    with open(filename, 'w') as out_file:
        out_file.writelines(f"{pred}\n" for pred in predictions)

if __name__ == '__main__':
    # The names assigned in this block are notebook-level globals; the
    # emoticon-count cell further down reads `test_emoticon_X` from here.

    # read datasets
    # NOTE(review): allow_pickle=True lets np.load unpickle objects stored in the
    # archive -- only appropriate for trusted files; confirm it is needed here.
    test_feat_X = np.load("datasets/test/test_feature.npz", allow_pickle=True)['features']
    test_emoticon_X = pd.read_csv("datasets/test/test_emoticon.csv")
    test_seq_X = pd.read_csv("datasets/test/test_text_seq.csv")['input_str'].tolist()

    
    # your trained models 
    feature_model = FeatureModel()
    text_model = TextSeqModel()
    emoticon_model  = EmoticonModel()
    best_model = CombinedModel()
    
    # predictions from your trained models
    pred_feat = feature_model.predict(test_feat_X)
    # Side effect: EmoticonModel.one_hot_encode_emoticons adds a 'split_emojis'
    # column to test_emoticon_X while producing these predictions.
    pred_emoticons = emoticon_model.predict(test_emoticon_X)
    # The text-sequence and combined predictions are currently disabled, so
    # text_model and best_model above are constructed but not used.
    # pred_text = text_model.predict(test_seq_X)
    # pred_combined = best_model.predict(test_feat_X, test_emoticon_X, test_seq_X)
    
    # saving prediction to text files (one prediction per line)
    save_predictions_to_file(pred_feat, "pred_feat.txt")
    save_predictions_to_file(pred_emoticons, "pred_emoticon.txt")
    # save_predictions_to_file(pred_text, "pred_text.txt")
    # save_predictions_to_file(pred_combined, "pred_combined.txt")
    




  X_test = X_test.applymap(replace_emoticon)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
import pandas as pd
from collections import Counter

# Count every emoticon character in the raw 'input_emoticon' column of the
# already-loaded test_emoticon_X DataFrame.  Reading the raw column, rather than
# a derived 'split_emojis' column, means this cell does not depend on an
# emoticon model having been run on the frame first.
all_emoticons = [emoticon for text in test_emoticon_X['input_emoticon'] for emoticon in text]

# Use Counter to count the occurrences of each emoticon
emoticon_counter = Counter(all_emoticons)

if emoticon_counter:
    # Find the most common emoticon and report it with its count
    most_common_emoticon, count = emoticon_counter.most_common(1)[0]
    print(f"The most common emoticon is: {most_common_emoticon}, appearing {count} times.")
else:
    # most_common(1) is empty when there is no data; avoid the IndexError.
    print("No emoticons found.")

The most common emoticon is: 😣, appearing 4464 times.
