# Machine Learning

### Input Data

In [2]:
import pandas as pd

In [3]:
df_input_parquet_pandas = pd.read_parquet('../data/input/input00.parquet')
display(df_input_parquet_pandas.head())

Unnamed: 0,CITY,HEADING,SECTION
0,chicago,Madden NFL 25 XBOX 360. Brand New!,for-sale
1,paris.en,looking for room to rent.,housing
2,newyork,two DS game,for-sale
3,seattle,map,housing
4,singapore,Good Looking Asian Sensation N aughty Girl ---...,services


### Training Data

In [4]:
df_training_parquet_pandas = pd.read_parquet('../data/input/training.parquet')
display(df_training_parquet_pandas.head())

Unnamed: 0,CATEGORY,CITY,HEADING,SECTION
0,cell-phones,newyork,New batteries C-S2 for Blackberry 7100/7130/87...,for-sale
1,cell-phones,newyork,******* Brand New Original SAMSUNG GALAXY NO...,for-sale
2,cell-phones,newyork,SAMSUNG GALAXY SIII T-999 MARBLE WHITE T-MOBIL...,for-sale
3,cell-phones,newyork,Ipad mini 64gb 4g any sim unlock,for-sale
4,cell-phones,newyork,htc evo 4g lte for trade,for-sale


In [57]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Read data
data = pd.read_parquet('../data/input/training.parquet')
data_input = pd.read_parquet('../data/input/input00.parquet')

# Define features function
features = lambda d: d['HEADING'] + d['SECTION']

# Create pipeline with LinearSVC(dual=True)
pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LinearSVC(dual=True))])

# Train the model
words = [features(row) for _, row in data.iterrows()]
classification = data['CATEGORY'].tolist()
model = pipe.fit(words, classification)

# Make predictions
words_input = [features(row) for _, row in data_input.iterrows()]
predictions = model.predict(words_input)

# Print predictions
print(*predictions, sep='\n')

# Calculate and print accuracy
accuracy = accuracy_score(data['CATEGORY'], model.predict(words))
print(f"Accuracy: {accuracy}")


video-games
wanted-housing
video-games
shared
therapeutic
appliances
appliances
therapeutic
therapeutic
artists
cell-phones
shared
shared
shared
shared
cell-phones
automotive
cell-phones
childcare
household-services
childcare
appliances
housing
automotive
therapeutic
temporary
therapeutic
wanted-housing
appliances
wanted-housing
childcare
shared
childcare
automotive
childcare
shared
wanted-housing
automotive
appliances
automotive
childcare
household-services
therapeutic
artists
cell-phones
video-games
wanted-housing
automotive
household-services
cell-phones
general
real-estate
photography
appliances
artists
shared
shared
therapeutic
temporary
appliances
cell-phones
household-services
video-games
general
household-services
photography
shared
shared
real-estate
video-games
appliances
therapeutic
shared
temporary
appliances
therapeutic
childcare
appliances
real-estate
appliances
shared
therapeutic
cell-phones
wanted-housing
household-services
cell-phones
shared
shared
photography
real-est

In [62]:
import numpy as np

# ... (rest of the code remains the same)

# Train the model
words = [features(row) for _, row in data.iterrows()]
classification = data['CATEGORY'].tolist()
model = pipe.fit(words, classification)

# Extract feature names from CountVectorizer
vectorizer = model.named_steps['vect']
feature_names = vectorizer.get_feature_names_out()  # scikit-learn 1.0 and later

# Get the weights from LinearSVC model
weights = model.named_steps['clf'].coef_

# Generate a0 and a1 as the most important words associated with each class label
a0 = feature_names[np.argmax(weights[0])]  # most important word for class label 0
a1 = feature_names[np.argmax(weights[1])]  # most important word for class label 1
print("Most important words:")
print(f"a0: {a0}")  # word associated with class label 0
print(f"a1: {a1}")  # word associated with class label 1



Most important words:
a0: tennis
a1: sale


In [50]:
df_predictions = pd.DataFrame(predictions)

# Save the predictions to a CSV file
df_predictions.to_csv('../data/output/svm_predictions.csv', index=False, header=False)

In [51]:
# Read the contents of output00.txt
with open('../data/output/output00.txt', 'r') as file:
    expected_results = file.read().splitlines()

# Compare the expected results with the predicted results
for i, (expected, predicted) in enumerate(zip(expected_results, predictions)):
    if expected == predicted:
        print(f"Row {i}: Expected: {expected}, Predicted: {predicted}, Match: True")
    else:
        print(f"Row {i}: Expected: {expected}, Predicted: {predicted}, Match: False")


Row 0: Expected: video-games, Predicted: video-games, Match: True
Row 1: Expected: wanted-housing, Predicted: wanted-housing, Match: True
Row 2: Expected: video-games, Predicted: video-games, Match: True
Row 3: Expected: shared, Predicted: shared, Match: True
Row 4: Expected: therapeutic, Predicted: therapeutic, Match: True
Row 5: Expected: video-games, Predicted: appliances, Match: False
Row 6: Expected: appliances, Predicted: appliances, Match: True
Row 7: Expected: therapeutic, Predicted: therapeutic, Match: True
Row 8: Expected: therapeutic, Predicted: therapeutic, Match: True
Row 9: Expected: artists, Predicted: artists, Match: True
Row 10: Expected: cell-phones, Predicted: cell-phones, Match: True
Row 11: Expected: general, Predicted: shared, Match: False
Row 12: Expected: wanted-housing, Predicted: shared, Match: False
Row 13: Expected: shared, Predicted: shared, Match: True
Row 14: Expected: wanted-housing, Predicted: shared, Match: False
Row 15: Expected: cell-phones, Predicte

In [52]:
# Read the contents of output00.txt
with open('../data/output/output00.txt', 'r') as file:
    expected_results = file.read().splitlines()

# Initialize counts
true_count = 0
false_count = 0

# Compare the expected results with the predicted results
for i, (expected, predicted) in enumerate(zip(expected_results, predictions)):
    if expected == predicted:
        true_count += 1
    else:
        false_count += 1

print(f"True count: {true_count}")
print(f"False count: {false_count}")


True count: 12466
False count: 2904


In [53]:
# Read the expected results from the file
with open('../data/output/output00.txt', 'r') as file:
    expected_results = file.read().splitlines()

# Compare the expected results with the predicted results
results = []
for i, (expected, predicted) in enumerate(zip(expected_results, predictions)):
    match = expected == predicted
    results.append({'Row': i, 'Expected': expected, 'Predicted': predicted, 'Match': match})

# Create a DataFrame from the results
df_results = pd.DataFrame(results, index=None)

# Print the DataFrame
print(df_results.to_string(index=False))

  Row           Expected          Predicted  Match
    0        video-games        video-games   True
    1     wanted-housing     wanted-housing   True
    2        video-games        video-games   True
    3             shared             shared   True
    4        therapeutic        therapeutic   True
    5        video-games         appliances  False
    6         appliances         appliances   True
    7        therapeutic        therapeutic   True
    8        therapeutic        therapeutic   True
    9            artists            artists   True
   10        cell-phones        cell-phones   True
   11            general             shared  False
   12     wanted-housing             shared  False
   13             shared             shared   True
   14     wanted-housing             shared  False
   15        cell-phones        cell-phones   True
   16         automotive         automotive   True
   17        cell-phones        cell-phones   True
   18          childcare       

In [54]:
# Count the total of True and Total of False
match_counts = df_results['Match'].value_counts()

# Print the counts
print("Total of True:", match_counts.get(True, 0))
print("Total of False:", match_counts.get(False, 0))

Total of True: 12466
Total of False: 2904


In [55]:
# Calculate the accuracy
accuracy = match_counts.get(True, 0) / len(df_results)
print("Accuracy:", accuracy)

Accuracy: 0.811060507482108


In [56]:
# Save the results to a CSV file
df_results.to_csv('../data/output/svm_results.csv', index=False)


In [ ]:
# Save the accuracy to a text file