In [9]:
cd /content/drive/MyDrive/CapeStart_TextClassification_Task

/content/drive/MyDrive/CapeStart_TextClassification_Task


In [10]:
!pwd

/content/drive/MyDrive/CapeStart_TextClassification_Task


In [18]:
!ls

articles.csv  Text_Classification.ipynb  unknown_articles.csv


# Import

In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [79]:
# Read CSV File
df = pd.read_csv('articles.csv', encoding='latin-1')
df.head(5)

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


In [80]:
# # Preprocessing and cleaning
# df.dropna(inplace=True)

In [81]:
# Encode Article type
label_encoder = LabelEncoder()
df['Article_Type'] = label_encoder.fit_transform(df['Article_Type'])

In [82]:
# Split-> Train and Test
X = df['Full_Article']
y = df['Article_Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
# !pip install sentence-transformers

In [86]:
# vectorization using sentenceBERT
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [87]:
X_train = X_train.reset_index(drop=True)
X_train_embeddings = model.encode(X_train)
X_test = X_test.reset_index(drop=True)
X_test_embeddings = model.encode(X_test)

In [89]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train_embeddings, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
classifier.classes_

array([0, 1, 2, 3, 4, 5, 6])

In [91]:
label_encoder.classes_

array(['Commercial', 'Executives', 'Financing', 'Military', 'Others',
       'Support & Services', 'Training'], dtype=object)

# Performance Metrics:

In [94]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = classifier.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_[:6])

print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.9001161440185831
                    precision    recall  f1-score   support

        Commercial       0.92      0.92      0.92       515
        Executives       1.00      0.90      0.95        10
         Financing       0.00      0.00      0.00         3
          Military       0.88      0.90      0.89       317
            Others       0.60      0.38      0.46         8
Support & Services       0.75      0.38      0.50         8

          accuracy                           0.90       861
         macro avg       0.69      0.58      0.62       861
      weighted avg       0.90      0.90      0.90       861



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
!ls

article_classifier_model.pkl  articles.csv  Text_Classification.ipynb  unknown_articles.csv


# save and reload the model

In [99]:
import joblib

# Save the model
joblib.dump(classifier, '/content/drive/MyDrive/CapeStart_TextClassification_Task/article_classifier_model.pkl')

# Load the model
loaded_model = joblib.load('/content/drive/MyDrive/CapeStart_TextClassification_Task/article_classifier_model.pkl')


# Create an API Endpoint

In [102]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/classify', methods=['POST'])
def classify_article():
    data = request.json  # Assuming you send article text as JSON
    embedding = model.encode(data['text'])
    predicted_class = loaded_model.predict([embedding])[0]
    class_label = label_encoder.inverse_transform([predicted_class])[0]

    return jsonify({'predicted_class': class_label})

if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


# Extract Heading and Full_Article

In [119]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the CSV containing URLs
unknown_articles_df = pd.read_csv('unknown_articles.csv')
unknown_articles_df.head(5)

Unnamed: 0,Article.URL
0,http://australianaviation.com.au/2018/10/a-com...
1,http://australianaviation.com.au/2018/10/victo...
2,http://australianaviation.com.au/2018/10/army-...
3,https://attain.news/community/special-sea-king...
4,https://m.ariva.de/amp/ad-hoc-airbus-board-of-...


In [120]:
# Becaue the 14 row url is not working so i remove it separately
row_15 = unknown_articles_df.iloc[14]  # Indexing is zero-based, so 14 corresponds to the 15th row
print(row_15)

Article.URL    https://www.newsoneplace.com/4085271809/army-a...
Name: 14, dtype: object


In [122]:
unknown_articles_df = unknown_articles_df.drop(14)

In [125]:
# headings = []
# full_articles = []

# count = 0

# for url in unknown_articles_df['Article.URL']:
#     response = requests.get(url)
#     print(f"count: {count} -> response.status_code: {response.status_code}")
#     count += 1
#     if response.status_code == 200:
#         soup = BeautifulSoup(response.text, 'html.parser')
#         h1_element = soup.find('h1')  # Find the <h1> element
#         if h1_element:
#             heading = h1_element.text  # Get the text if <h1> is found
#         else:
#             heading = ''  # Set a default value if <h1> is not found
#         article = ' '.join([p.text for p in soup.find_all('p')])  # Concatenate all paragraphs
#         headings.append(heading)
#         full_articles.append(article)
#     else:
#         headings.append('')
#         full_articles.append('')

# unknown_articles_df['Headings'] = headings
# unknown_articles_df['Full_Article'] = full_articles


# Predict

In [None]:
# Preprocess and clean the extracted data as in step 1
X_unknown = unknown_articles_df['Full_Article']  # Extracted full articles
X_unknown_embeddings = model.encode(X_unknown)

# Predict Article_Type
predicted_classes = loaded_model.predict(X_unknown_embeddings)
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Add the predicted Article_Type to the DataFrame
unknown_articles_df['Predicted_Article_Type'] = predicted_labels

# Save the results to a new CSV file
unknown_articles_df.to_csv('/content/drive/MyDrive/CapeStart_TextClassification_Task/predicted_unknown_articles.csv', index=False)