<a href="https://colab.research.google.com/github/abhinav420y/ZAF034_BA_Anti-Discrimination/blob/Vaibhavi15-04-patch-1/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Loading

Code to create a sample dataset if you aren't able to use the CSV file.

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG for reproducibility.
# NOTE(review): the shuffling cell below uses Python's `random` module
# (random.shuffle), which this call does not seed.
np.random.seed(42)

# Ten hand-written positive review templates. The dataset-building cell
# cycles through this list to produce the rows labelled "positive".
good_reviews = [
    "The flight was great! The cabin crew were friendly and helpful.",
    "I had a fantastic experience with this airline. The food was delicious and the seats were comfortable.",
    "I really enjoyed my flight. The in-flight entertainment was top-notch.",
    "The flight was on time and everything went smoothly. I would definitely fly with this airline again.",
    "I had a wonderful experience with this airline. The staff were professional and attentive.",
    "The flight was excellent. The cabin crew went above and beyond to make sure I was comfortable.",
    "I was very impressed with this airline. The service was exceptional and the amenities were great.",
    "The flight was perfect. I couldn't have asked for more.",
    "I had a great time on my flight. The staff were friendly and the atmosphere was pleasant.",
    "The flight was amazing. I felt well taken care of and the entire experience was enjoyable."
]

# Ten hand-written negative review templates. The dataset-building cell
# cycles through this list to produce the rows labelled "negative".
bad_reviews = [
    "The flight was terrible. The seats were uncomfortable and the staff were rude.",
    "I had a horrible experience with this airline. The flight was delayed and the service was terrible.",
    "The flight was a nightmare. The food was terrible and the staff were unhelpful.",
    "I would not recommend this airline. The flight was overbooked and I was bumped from my seat.",
    "The flight was a disaster. My luggage was lost and the staff were uncooperative.",
    "I had a terrible experience with this airline. The flight was delayed and the staff were incompetent.",
    "The flight was a disappointment. The cabin was dirty and the service was subpar.",
    "I was not satisfied with my flight. The staff were unfriendly and the amenities were lacking.",
    "The flight was a letdown. The seats were uncomfortable and the food was inedible.",
    "I had a miserable experience with this airline. The flight was uncomfortable and the staff were unprofessional."
]

# Ten hand-written templates describing discriminatory treatment. The
# dataset-building cell cycles through this list to produce the rows
# labelled "discriminative".
discriminative_feedbacks = [
    "I felt like I was treated differently because of my race.",
    "The airline staff made some inappropriate comments about my gender identity.",
    "The staff were dismissive of my disability and did not provide appropriate accommodations.",
    "I was subjected to racial profiling by the airline staff.",
    "I felt unsafe on my flight due to the discriminatory behavior of the cabin crew.",
    "The airline staff made insensitive remarks about my religious beliefs.",
    "I was discriminated against by the airline staff based on my sexual orientation.",
    "The airline staff treated me unfairly because of my age.",
    "I experienced discrimination on my flight due to my physical appearance.",
    "The airline staff were biased against me because of my nationality."
]




In [None]:
import random
import csv

# Seed Python's own RNG. random.shuffle() draws from the `random` module,
# which np.random.seed() does not touch, so without this line the generated
# CSV (and every downstream train/test split) changes on each run.
random.seed(42)

# Shuffle the good and bad review templates in place
random.shuffle(good_reviews)
random.shuffle(bad_reviews)

# Build the labelled rows by cycling through the templates:
# 400 positive + 400 negative + 200 discriminative = 1000 rows.
feedbacks = []
for i in range(400):
    feedbacks.append([good_reviews[i % len(good_reviews)], "positive"])
    feedbacks.append([bad_reviews[i % len(bad_reviews)], "negative"])
for i in range(200):
    feedbacks.append([discriminative_feedbacks[i % len(discriminative_feedbacks)], "discriminative"])

# Mix the three classes together so the file is not ordered by label
random.shuffle(feedbacks)

# Write the header and all rows to the CSV file
with open("airline_feedback_data.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["feedback", "sentiment"])
    writer.writerows(feedbacks)


In [None]:
import pandas as pd
# Load the feedback CSV into a DataFrame for the inspection cells below.
df = pd.read_csv("airline_feedback_data.csv")

Data Preprocessing

In [None]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,feedback,sentiment
0,I really enjoyed my flight. The in-flight ente...,positive
1,I was not satisfied with my flight. The staff ...,negative
2,I really enjoyed my flight. The in-flight ente...,positive
3,The flight was excellent. The cabin crew went ...,positive
4,I really enjoyed my flight. The in-flight ente...,positive


In [None]:
# Boolean mask: True for rows labelled "discriminative".
df['sentiment'].eq('discriminative')

0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996    False
997     True
998    False
999    False
Name: sentiment, Length: 1000, dtype: bool

In [None]:
# (rows, columns) of the loaded data.
df.shape

(1000, 2)

In [None]:
# Preview the last rows of the loaded data.
df.tail()

Unnamed: 0,feedback,sentiment
995,The airline staff were biased against me becau...,discriminative
996,The flight was perfect. I couldn't have asked ...,positive
997,I felt like I was treated differently because ...,discriminative
998,The flight was great! The cabin crew were frie...,positive
999,The flight was a letdown. The seats were uncom...,negative


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [6]:
# Read the labelled feedback file.
df = pd.read_csv("airline_feedback_data.csv")

# Hold out 20% of the rows for evaluation (fixed random_state for a stable split).
# NOTE(review): if the CSV came from the sample generator above, it contains
# only 30 distinct sentences repeated many times, so identical texts appear in
# both train and test and the reported scores will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    df["feedback"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

In [2]:
# Fetch the 'punkt' tokenizer data from NLTK (used by the word_tokenize calls below).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Fetch the 'wordnet' corpus from NLTK (used by WordNetLemmatizer below).
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
# Normalise each training text: tokenize, lowercase, lemmatize every token,
# then join the tokens back into one space-separated string.
lemmatizer = WordNetLemmatizer()
X_train_tokens = [
    ' '.join(lemmatizer.lemmatize(word.lower()) for word in word_tokenize(doc))
    for doc in X_train
]


In [9]:
# Apply the same tokenize / lowercase / lemmatize / re-join normalisation
# to the held-out texts, reusing the lemmatizer created for the training set.
X_test_tokens = [
    ' '.join(lemmatizer.lemmatize(word.lower()) for word in word_tokenize(doc))
    for doc in X_test
]

In [None]:
# Vectorize the lemmatized text into bag-of-words counts.
# The preceding cells build X_train_tokens / X_test_tokens; fitting on the raw
# X_train / X_test instead would silently discard that preprocessing.
# The vocabulary is learned from the training split only and then reused
# unchanged for the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train_tokens)
X_test_vec = vectorizer.transform(X_test_tokens)


Model

In [None]:

# Fit a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so construction and training are chained;
# the bare `model` keeps the estimator displayed as the cell's output.
model = MultinomialNB().fit(X_train_vec, y_train)
model

In [None]:
# Predict a label for every held-out row; y_pred is reused by the
# classification report cell, so it is kept as a global.
y_pred = model.predict(X_test_vec)
# Print the raw array of predicted labels.
print(y_pred)

['discriminative' 'positive' 'positive' 'negative' 'negative' 'positive'
 'negative' 'negative' 'discriminative' 'positive' 'negative' 'positive'
 'discriminative' 'positive' 'positive' 'positive' 'positive' 'negative'
 'positive' 'negative' 'discriminative' 'negative' 'discriminative'
 'positive' 'negative' 'positive' 'negative' 'positive' 'negative'
 'negative' 'discriminative' 'positive' 'discriminative' 'positive'
 'negative' 'discriminative' 'negative' 'negative' 'negative' 'positive'
 'positive' 'negative' 'negative' 'discriminative' 'positive' 'negative'
 'positive' 'discriminative' 'positive' 'negative' 'positive' 'positive'
 'positive' 'negative' 'positive' 'positive' 'positive' 'discriminative'
 'negative' 'positive' 'negative' 'positive' 'negative' 'negative'
 'discriminative' 'positive' 'negative' 'positive' 'discriminative'
 'negative' 'positive' 'discriminative' 'positive' 'negative' 'positive'
 'negative' 'positive' 'positive' 'negative' 'negative' 'positive'
 'discrimin

In [None]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

discriminative       1.00      1.00      1.00        50
      negative       1.00      1.00      1.00        68
      positive       1.00      1.00      1.00        82

      accuracy                           1.00       200
     macro avg       1.00      1.00      1.00       200
  weighted avg       1.00      1.00      1.00       200



Saving A Model

In [None]:
import h5py
from sklearn.naive_bayes import MultinomialNB


# Save the fitted Naive Bayes parameters in HDF5 format.
# NOTE(review): only the values written below are persisted; the
# CountVectorizer vocabulary is not, so this file alone is not enough to
# classify new text.
with h5py.File("model.h5", "w") as f:
    # Create a group to store the model
    group = f.create_group("model")

    # Store the smoothing parameter and the fitted count statistics
    group.create_dataset("alpha", data=model.alpha)
    group.create_dataset("class_count", data=model.class_count_)
    group.create_dataset("feature_count", data=model.feature_count_)

    # Store the class labels. A plain list of Python str is converted to a
    # NumPy fixed-width Unicode ('U') array, which h5py does not store, so
    # request a variable-length UTF-8 string dtype explicitly.
    classes = [str(c) for c in model.classes_]
    group.create_dataset("classes", data=classes, dtype=h5py.string_dtype())
