In [None]:
import pandas as pd

# Load the training tweets; the CSV is read as UTF-8 text.
df = pd.read_csv("tweets-train.csv", encoding="utf-8")

# Keep only the non-missing entries of the "tweet" column, coerced to plain strings.
tweets_present = df["tweet"].dropna()
summ_texts = tweets_present.astype(str).tolist()

# Flatten every tweet into a single space-separated corpus string.
corpus = " ".join(summ_texts)

# Persist the corpus to disk as UTF-8.
with open("temp1.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(corpus)

print("extraction done")

In [2]:
from pathlib import Path
import regex as re
from tqdm import tqdm

# ---------- CONFIG ----------
input_path = Path("temp1.txt")   # raw corpus to clean
output_path = Path("temp2.txt")  # cleaned corpus, one sentence per line

# ---------- COMPILE REGEX ----------
# One or more consecutive Devanagari characters (the script Marathi is written in)
marathi_pattern = re.compile(r'\p{Devanagari}+')
# A literal full stop together with any whitespace that follows it
sentence_splitter = re.compile(r'\.\s*')

# ---------- STEP 1 + 2: Read the corpus and cut it at full stops ----------
text = input_path.read_text(encoding="utf-8")
sentences = sentence_splitter.split(text)

# ---------- STEP 3: Keep only the Devanagari runs of each sentence ----------
# The generator is lazy, so tqdm advances while the list below is being built.
devanagari_runs = (
    marathi_pattern.findall(chunk)
    for chunk in tqdm(sentences, desc="Cleaning sentences")
)
# Chunks without any Devanagari produce an empty list and are dropped.
cleaned_sentences = [" ".join(runs) for runs in devanagari_runs if runs]

# ---------- STEP 4: Write output ----------
cleaned_blob = "\n".join(cleaned_sentences)
output_path.write_text(cleaned_blob, encoding="utf-8")

print(f"✅ Done! Saved {len(cleaned_sentences)} clean Marathi sentences to '{output_path.name}'.")


Cleaning sentences: 100%|██████████| 45637/45637 [00:00<00:00, 259087.92it/s]

✅ Done! Saved 28189 clean Marathi sentences to 'temp2.txt'.





In [None]:
import time
# Greedy next-word generation: starting from a seed phrase, repeatedly ask the
# model for the highest-scoring token id and append the matching word.
# NOTE(review): `tokenizer`, `pad_sequences`, `model` and `np` are not defined
# in any cell visible here — this cell presumably depends on an earlier
# training cell/session; confirm it survives Restart & Run All.
text = "राम घरी जातो"

# Reverse vocabulary (token id -> word), built once up front instead of
# scanning all of tokenizer.word_index on every generation step.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  # maxlen=56 presumably mirrors the input length used at training time — TODO confirm
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = int(np.argmax(model.predict(padded_token_text)))

  # An id with no vocabulary entry appends nothing and prints nothing,
  # exactly as the previous linear scan behaved when it found no match.
  next_word = index_to_word.get(pos)
  if next_word is not None:
    text = text + " " + next_word
    print(text)
      

In [2]:
from tqdm import tqdm

input_file = "marathi_clean.txt"      # file the sentences are read from
output_file = "marathi_short.txt"     # file the kept sentences are written to
min_words = 5                         # fewest words a kept sentence may have
max_words = 15                        # most words a kept sentence may have

def count_words(sentence):
    """Return the number of whitespace-separated tokens in `sentence`."""
    # split() with no argument ignores leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    tokens = sentence.split()
    return len(tokens)

# Stream the source file line by line and copy over only the sentences whose
# word count lies between min_words and max_words (both inclusive).
with open(input_file, "r", encoding="utf-8") as infile, \
     open(output_file, "w", encoding="utf-8") as outfile:

    # First pass over the already-open handle only counts lines so tqdm can
    # show a total. Reusing `infile` avoids opening a second handle that would
    # never be closed; the seek rewinds it for the real pass below.
    total_lines = sum(1 for _ in infile)
    infile.seek(0)

    for line in tqdm(infile, total=total_lines, desc="Filtering sentences"):
        line = line.strip()
        if not line:
            continue  # skip empty lines

        if min_words <= count_words(line) <= max_words:
            outfile.write(line + "\n")

print(f"\n✅ Cleaned sentences saved to '{output_file}'")


Filtering sentences: 100%|██████████| 50724/50724 [00:00<00:00, 507747.71it/s]


✅ Cleaned sentences saved to 'marathi_short.txt'





In [3]:
from tqdm import tqdm

input_file = "marathi_clean.txt"     # file the sentences are read from
output_file = "marathi_10.txt"       # file the trimmed sentences are written to
target_length = 10                   # required minimum word count, and the length kept

def process_sentence(sentence):
    """Cut `sentence` down to its first `target_length` words.

    Returns the truncated sentence with its words joined by single spaces,
    or None when the sentence has fewer than `target_length` words.
    """
    tokens = sentence.split()
    if len(tokens) >= target_length:
        return " ".join(tokens[:target_length])
    return None  # too short to use

# Stream the source file and write out, for every sentence that is long
# enough, its first `target_length` words (see process_sentence).
with open(input_file, "r", encoding="utf-8") as infile, \
     open(output_file, "w", encoding="utf-8") as outfile:

    # Count lines on the already-open handle so tqdm can show a total, then
    # rewind. This avoids opening a second handle that is never closed.
    total_lines = sum(1 for _ in infile)
    infile.seek(0)

    for line in tqdm(infile, total=total_lines, desc="Processing sentences"):
        line = line.strip()
        if not line:
            continue  # skip empty lines

        processed = process_sentence(line)
        if processed:  # None means the sentence was too short
            outfile.write(processed + "\n")

# Report the configured length rather than a hard-coded "10" so the message
# stays correct if target_length is changed.
print(f"\n✅ Sentences of exactly {target_length} words saved to '{output_file}'")


Processing sentences: 100%|██████████| 25852/25852 [00:00<00:00, 409382.69it/s]


✅ Sentences of exactly 10 words saved to 'marathi_10.txt'





In [3]:
input_file = "marathi_short.txt"
output_file = "temp.txt"

# Read the file as raw bytes so invalid UTF-8 cannot raise while reading.
with open(input_file, "rb") as source:
    raw_data = source.read()

# Decode as UTF-8; errors="ignore" silently drops any undecodable bytes.
clean_text = str(raw_data, encoding="utf-8", errors="ignore")

# Write the decoded text back out as UTF-8.
with open(output_file, "w", encoding="utf-8") as sink:
    sink.write(clean_text)

print("✅ Cleaned file saved as", output_file)


✅ Cleaned file saved as temp.txt
