In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd

# Read data
data = pd.read_parquet('../data/input/training.parquet')
data_input = pd.read_parquet('../data/input/input00.parquet')

# Define features function
features = lambda d: d['HEADING'] + d['SECTION']

# Create pipeline with RandomForestClassifier
pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])

# Train the model
words = [features(row) for _, row in data.iterrows()]
classification = data['CATEGORY'].tolist()
model = pipe.fit(words, classification)

# Make predictions
words_input = [features(row) for _, row in data_input.iterrows()]
predictions = model.predict(words_input)

# Print predictions
print(*predictions, sep='\n')

# Calculate and print accuracy
accuracy = accuracy_score(data['CATEGORY'], model.predict(words))
print(f"Accuracy: {accuracy}")


video-games
wanted-housing
video-games
shared
therapeutic
appliances
appliances
therapeutic
therapeutic
artists
cell-phones
wanted-housing
wanted-housing
shared
temporary
cell-phones
automotive
cell-phones
childcare
household-services
childcare
appliances
housing
automotive
therapeutic
shared
therapeutic
wanted-housing
appliances
wanted-housing
childcare
shared
childcare
automotive
childcare
shared
wanted-housing
household-services
appliances
automotive
childcare
household-services
therapeutic
general
cell-phones
video-games
wanted-housing
real-estate
household-services
cell-phones
general
real-estate
photography
appliances
artists
temporary
shared
therapeutic
temporary
appliances
cell-phones
automotive
video-games
general
general
photography
shared
shared
real-estate
video-games
appliances
therapeutic
shared
temporary
appliances
therapeutic
childcare
appliances
real-estate
appliances
wanted-housing
therapeutic
cell-phones
wanted-housing
household-services
cell-phones
shared
shared
pho

In [2]:
df_predictions = pd.DataFrame(predictions)

# Save the predictions to a CSV file
df_predictions.to_csv('../data/output/random_forest_predictions.csv', index=False, header=False)

In [3]:
# Read the expected results from the file
with open('../data/output/output00.txt', 'r') as file:
    expected_results = file.read().splitlines()

# Compare the expected results with the predicted results
results = []
for i, (expected, predicted) in enumerate(zip(expected_results, predictions)):
    match = expected == predicted
    results.append({'Row': i, 'Expected': expected, 'Predicted': predicted, 'Match': match})

# Create a DataFrame from the results
df_results = pd.DataFrame(results, index=None)

# Print the DataFrame
print(df_results.to_string(index=False))

  Row           Expected          Predicted  Match
    0        video-games        video-games   True
    1     wanted-housing     wanted-housing   True
    2        video-games        video-games   True
    3             shared             shared   True
    4        therapeutic        therapeutic   True
    5        video-games         appliances  False
    6         appliances         appliances   True
    7        therapeutic        therapeutic   True
    8        therapeutic        therapeutic   True
    9            artists            artists   True
   10        cell-phones        cell-phones   True
   11            general     wanted-housing  False
   12     wanted-housing     wanted-housing   True
   13             shared             shared   True
   14     wanted-housing          temporary  False
   15        cell-phones        cell-phones   True
   16         automotive         automotive   True
   17        cell-phones        cell-phones   True
   18          childcare       

In [4]:
# Count the total of True and Total of False
match_counts = df_results['Match'].value_counts()

# Print the counts
print("Total of True:", match_counts.get(True, 0))
print("Total of False:", match_counts.get(False, 0))

Total of True: 12021
Total of False: 3349


In [5]:
# Calculate the accuracy
accuracy = match_counts.get(True, 0) / len(df_results)
print("Accuracy:", accuracy)

Accuracy: 0.7821080026024724


In [6]:
# Save the results to a CSV file
df_results.to_csv('../data/output/random_forest_results.csv', index=False)