# **Import JSON File From Drive**

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so that files stored on Drive
# can be opened with ordinary paths below the mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


In [None]:
%cd /content/drive/My Drive/DMKD

# **Import All Libraries**

In [None]:
import json
import csv
import sklearn
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# **Convert JSON to CSV**

In [None]:


# Convert the JSON export into a CSV file with one row per record.
# The file is expected to hold a list of flat objects (dicts).

# Read the JSON data; the encoding is stated explicitly so the result does
# not depend on the runtime's default locale.
with open('/content/drive/MyDrive/DMKD/test.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Fail early with a clear message instead of an IndexError on data[0].
if not data:
    raise ValueError('test.json contains no records; nothing to convert')

# Build the header from the keys of *all* records (first-seen order), not just
# the first one, so records with extra or missing keys stay column-aligned.
fieldnames = list(dict.fromkeys(key for row in data for key in row))

# Write the CSV into the current working directory.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    # DictWriter places each value under its own column name; keys missing
    # from a record are written as empty cells (restval='').
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames, restval='')
    writer.writeheader()
    writer.writerows(data)


In [None]:
# Load the freshly written CSV from Drive and preview its first rows.
output_csv_path = '/content/drive/MyDrive/DMKD/output.csv'
df = pd.read_csv(output_csv_path)
df.head()

# **Pre-Processed Data**

In [None]:
# Pre-process the raw posts: tokenize, drop stopwords and non-alphabetic
# tokens, stem what remains, and store the result in a 'processed_text' column.

# The stopword corpus is not bundled with NLTK; download it so this cell also
# works on a fresh runtime.
nltk.download('stopwords', quiet=True)

# Read the CSV produced by the JSON-to-CSV step. It is read from the project
# folder on Drive, the same location the preview cell reads it from.
df = pd.read_csv('/content/drive/MyDrive/DMKD/output.csv')

# Extract the 'post' column, which holds the raw text
texts = df['post']

# Tokenizer that keeps runs of word characters and discards everything else
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# English stopwords as a set, so each membership test in the loop is O(1)
stop_words = set(stopwords.words('english'))

# Porter stemmer used to reduce tokens to their stems
stemmer = PorterStemmer()

# One pre-processed string per post, in the same order as the DataFrame rows
processed_texts = []

for post in texts:
    # str() guards against non-string cells (e.g. missing values read as NaN)
    tokens = tokenizer.tokenize(str(post))

    # Keep purely alphabetic tokens that are not stopwords (case-insensitive)
    filtered_tokens = [
        token for token in tokens
        if token.isalpha() and token.lower() not in stop_words
    ]

    # Stem the remaining tokens
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # Join the stems back into a single space-separated string
    processed_texts.append(' '.join(stemmed_tokens))

# Add the processed texts to the DataFrame as a new column
df['processed_text'] = processed_texts

# Save the extended DataFrame to a new CSV file in the working directory
df.to_csv('processed_texts.csv', index=False)


In [None]:
# Clean the pre-processed data: drop incomplete rows, duplicates and the raw
# 'post' column, then write the result back to 'processed_texts.csv'.

# Load the CSV file
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# Remove rows with missing data in any column
df = df.dropna()

# Remove duplicate rows
df = df.drop_duplicates()

# Remove the raw 'post' column. errors='ignore' keeps this cell re-runnable:
# the file is overwritten below, so on a second run the column is already gone.
df = df.drop(columns=['post'], errors='ignore')

# Save the cleaned data, replacing the previous file in the working directory
df.to_csv('processed_texts.csv', index=False)

In [None]:
# Re-read the cleaned file from Drive and preview its first rows.
cleaned_csv_path = '/content/drive/MyDrive/DMKD/processed_texts.csv'
df = pd.read_csv(cleaned_csv_path)
df.head()

In [None]:
# Quick look at the cleaned data set: sample rows, summary statistics, schema.

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# View the first few rows of the data
print(df.head())

# Get some basic statistics about the data
print(df.describe())

# Show dtypes and non-null counts per column. DataFrame.info() prints its
# report itself and returns None, so it must not be wrapped in print()
# (doing so appends a stray "None" to the output).
df.info()

# **Support Vector Model**

In [None]:
# Train and evaluate a Support Vector classifier on bag-of-words features.

# Load the data from the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# Extract the text data and labels
X = df['processed_text']
y = df['gender']

# Split the *raw text* first. The seed is fixed so the split (and therefore
# the reported accuracy) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary from the training texts only, then apply it to the test
# texts; fitting on the full corpus would leak test-set vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Support Vector model
model = SVC()
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)

# Calculate the test set accuracy (fraction of matching predictions)
accuracy = np.mean(y_pred == y_test)
print('Test accuracy:', accuracy)


# **Logistic Regression Model**

In [None]:

# Tune, train and evaluate a Logistic Regression classifier on bag-of-words
# features.
from sklearn.model_selection import GridSearchCV

# Load the data from the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# Extract the text data and labels
X = df['processed_text']
y = df['gender']

# Split the *raw text* first. The seed is fixed so the split (and therefore
# the reported accuracy) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary from the training texts only, then apply it to the test
# texts; fitting on the full corpus would leak test-set vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Search over regularisation strength and penalty type. The default 'lbfgs'
# solver does not support the 'l1' penalty, so those candidates would fail;
# 'liblinear' supports both 'l1' and 'l2'.
param_grid = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}
grid = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    param_grid,
    verbose=2,
)
grid.fit(X_train, y_train)
print(grid.best_params_)

# Use the tuned model. GridSearchCV refits the best parameter combination on
# the whole training set by default, so no separate fit is needed.
model = grid.best_estimator_

# Evaluate the model on the test set
y_pred = model.predict(X_test)

# Calculate the test set accuracy (fraction of matching predictions)
accuracy = np.mean(y_pred == y_test)
print('Test accuracy:', accuracy)


# **Decision Tree Classifier Model**

In [None]:
# Train and evaluate a Decision Tree classifier on bag-of-words features.

# Load the data from the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# Extract the text data and labels
X = df['processed_text']
y = df['gender']

# Split the *raw text* first. The seed is fixed so the split (and therefore
# the reported accuracy) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary from the training texts only, then apply it to the test
# texts; fitting on the full corpus would leak test-set vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Decision Tree Classifier model; seeded so the fitted tree is
# reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)

# Calculate the test set accuracy (fraction of matching predictions)
accuracy = np.mean(y_pred == y_test)
print('Test accuracy:', accuracy)


# **Naive Bayes Model**

In [None]:
# Train and evaluate a Multinomial Naive Bayes classifier on bag-of-words
# features.

# Load the data from the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/DMKD/processed_texts.csv')

# Extract the text data and labels
X = df['processed_text']
y = df['gender']

# Split the *raw text* first. The seed is fixed so the split (and therefore
# the reported accuracy) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary from the training texts only, then apply it to the test
# texts; fitting on the full corpus would leak test-set vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)

# Calculate the test set accuracy (fraction of matching predictions)
accuracy = np.mean(y_pred == y_test)
print('Test accuracy:', accuracy)