<a href="https://colab.research.google.com/github/ahmedlabib02/Nlp-project/blob/main/Nlp_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone 1

In [23]:
import os 
import json
import re
import pandas as pd
from collections import Counter
import string

### Data preparation

#### Unzipping the "B Hodoo2" channel folder

In [25]:
import zipfile

# Archive to unpack and the directory its contents are written into.
zip_path = "./B Hodoo2-20250226T182935Z-001.zip"
extract_path = "./"

# Extract every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)


#### Unzipping the "Kefaya Ba2a" channel folder

In [26]:
import zipfile

# Archive to unpack and the directory its contents are written into.
zip_path = "./Kefaya Ba2a-20250226T183527Z-001.zip"
extract_path = "./"

# Extract every member; the context manager closes the archive afterwards.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

### Building a dictionary of annotations and transcript text

In [2]:
def standardize(title):
    """
    Reduce a title to a canonical form suitable for equality comparison.

    Steps, in order: underscores become spaces, the text is lower-cased,
    every character that is neither a word character nor whitespace is
    removed, and runs of whitespace collapse to a single space with the
    ends trimmed.

    Returns the canonicalized string.
    """
    # Underscores count as word characters for the regex below, so they have
    # to be turned into spaces before the punctuation is stripped.
    spaced = title.replace("_", " ").lower()
    without_punctuation = re.sub(r'[^\w\s]', '', spaced)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

In [4]:
def get_category_for_filename(target_filename, annotations_path):
    """
    Look up the annotated category for a transcript file.

    The JSON document at annotations_path is loaded and iterated entry by
    entry. target_filename has its extension removed and is passed through
    standardize(); each entry with a non-empty 'title' is standardized the
    same way and compared for equality.

    Returns the 'category' value of the first matching entry (which may be
    None if that entry has no 'category'), or None when nothing matches.
    """
    with open(annotations_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)

    wanted = standardize(os.path.splitext(target_filename)[0])

    # Lazily yield the category of each entry whose canonical title matches;
    # next() stops at the first one and falls back to None.
    matching_categories = (
        entry.get('category')
        for entry in entries
        if entry.get('title') and standardize(entry.get('title')) == wanted
    )
    return next(matching_categories, None)

In [5]:
def add_transcripts_from_folder(transcripts_dict, folder_path, channel):
    """
    Load every transcript under <folder_path>/raw into transcripts_dict.

    Each regular file in the 'raw' sub-folder is read as UTF-8 text and stored
    under its file name without extension as a
    (channel, category, transcript) tuple. The category is obtained from
    get_category_for_filename() using <folder_path>/annotations.json.
    An existing key with the same name is overwritten.

    The dictionary is modified in place and also returned.
    """
    raw_dir = os.path.join(folder_path, 'raw')
    annotations_file = os.path.join(folder_path, 'annotations.json')

    for name in os.listdir(raw_dir):
        path = os.path.join(raw_dir, name)
        # Skip sub-directories and anything else that is not a regular file.
        if not os.path.isfile(path):
            continue

        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()

        label = get_category_for_filename(name, annotations_file)
        transcripts_dict[os.path.splitext(name)[0]] = (channel, label, text)

    return transcripts_dict

In [6]:
# Extracted channel folders, each holding annotations.json and a raw/ directory.
first_folder_path = './B Hodoo2/'
second_folder_path = './Kefaya Ba2a/'

# Accumulate both channels into a single dict keyed by transcript title.
transcripts_dict = {}
for folder_path, channel_name in (
    (first_folder_path, 'B Hodoo2'),
    (second_folder_path, 'Kefaya Ba2a'),
):
    transcripts_dict = add_transcripts_from_folder(transcripts_dict, folder_path, channel_name)



### Creating a dataframe

In [None]:
# Flatten {title: (channel, category, transcript)} into one row per transcript.
data = [
    (title, channel, category, transcript)
    for title, (channel, category, transcript) in transcripts_dict.items()
]
df = pd.DataFrame(data, columns=['title', 'channel', 'category', 'transcript'])

# Show only the first row as a sanity check.
print(df.head(1))


### Analysis

In [22]:
# Tally how many transcripts carry each category value.
category_counts = Counter(df['category'].tolist())
print(category_counts)

Counter({'Education': 26, 'People & Blogs': 17, 'Comedy': 4})


**Since there are some entries without categories or an annotation file, we just remove them.**

In [None]:
# Keep only rows that have a category. An element-wise `!= None` comparison
# evaluates to True for every row in pandas, so it filters nothing; notna()
# is the supported test for missing values (None / NaN).
df = df[df['category'].notna()]

**Exploring dataset size**

In [33]:
# Report the dataset size as (rows, columns).
row_count, column_count = df.shape
print((row_count, column_count))

(47, 4)


### Cleaning the data

In [None]:
def clean_text(text):
    """
    Remove punctuation, ASCII digits and ASCII Latin letters from text.

    Deleted characters are everything in string.punctuation plus the extra
    marks listed in the literal below. After deletion, runs of whitespace are
    collapsed to one space and leading/trailing whitespace is trimmed.

    Returns the cleaned string.
    """
    chars_to_drop = string.punctuation + "؟،؛«»…ـ"
    # Mapping a code point to None tells str.translate to delete it.
    deletion_table = dict.fromkeys(map(ord, chars_to_drop))
    without_marks = text.translate(deletion_table)
    without_ascii_alnum = re.sub(r'[0-9A-Za-z]', '', without_marks)
    return re.sub(r'\s+', ' ', without_ascii_alnum).strip()

In [None]:
# Join the path parts with os.path.join so the separator is right on every OS.
# A literal backslash inside a normal string is only a path separator on
# Windows and forms an invalid escape sequence in front of a non-escape character.
file_path = os.path.join(
    "B Hodoo2",
    "raw",
    "لماذا نيأس؟ عن فلسفة المثابرة _ بودكاست بهدوء مع كريم _ جلسة 20.txt",
)
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# Print the cleaned transcript of this one episode as a spot check.
print(clean_text(text))