# Morphological Analyzer

## Import python packages

In [3]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer 

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Only show ERROR messages

## Read Dataset

In [4]:
df = pd.read_excel("./dataset/complete words.xlsx")
df.head()

Unnamed: 0,Name,Surface,Deep
0,نہا,نہاوندیاں,"نہا , V , Hab , Fem , Pl , 1P"
1,نہا,نہاندیاں,"نہا,V , Hab , Fem , Pl , 2P"
2,نہا,نہاوندیاں,"نہا ,V , Hab , Fem , Pl , 2P"
3,نہا,نہا,"نہا,V , Comd , Sg , 2P , Hon1"
4,نہا,نہائیں,"نہا ,V , Comd , Sg , 2P , Hon1"


## Dataframe Shape

In [5]:
df.shape

(151884, 3)

## Dataframe information

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151884 entries, 0 to 151883
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Name     151876 non-null  object
 1   Surface  151882 non-null  object
 2   Deep     151884 non-null  object
dtypes: object(3)
memory usage: 3.5+ MB


## Drop null values and prepare features, labels and display

In [7]:
# Columns are named 'Name', 'Surface', 'Deep'
df.columns = ['Name', 'Surface', 'Deep']

# Drop duplicates and NaN values
df = df.drop_duplicates()
df = df.dropna()

# Prepare features and labels
X_surface = df['Surface']
y_deep = df['Deep']

# Display the first few entries
X_surface.head()

0    نہاوندیاں
1     نہاندیاں
2    نہاوندیاں
3          نہا
4       نہائیں
Name: Surface, dtype: object

In [8]:
y_deep.head()

0     نہا , V , Hab , Fem , Pl , 1P
1       نہا,V , Hab , Fem , Pl , 2P
2      نہا ,V , Hab , Fem , Pl , 2P
3     نہا,V , Comd , Sg , 2P , Hon1
4    نہا ,V , Comd , Sg , 2P , Hon1
Name: Deep, dtype: object

## Tokenizing and Padding Surface and Deep Representations

In [9]:
# Tokenizing the surface words
tokenizer_surface = Tokenizer()
tokenizer_surface.fit_on_texts(X_surface)
X_surface_seq = tokenizer_surface.texts_to_sequences(X_surface)

# Padding sequences for surface words
max_surface_len = max(len(x) for x in X_surface_seq)
X_surface_pad = pad_sequences(X_surface_seq, maxlen=max_surface_len)

# Tokenizing the deep representations
tokenizer_deep = Tokenizer()
tokenizer_deep.fit_on_texts(y_deep)
y_deep_seq = tokenizer_deep.texts_to_sequences(y_deep)
y_deep_pad = pad_sequences(y_deep_seq, maxlen=1)  # Each surface has a single deep representation

## Train Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_surface_pad, y_deep_pad, test_size=0.2, random_state=42)

print(f'Training data shape: {X_train.shape}, {y_train.shape}')
print(f'Test data shape: {X_test.shape}, {y_test.shape}')

Training data shape: (101586, 2), (101586, 1)
Test data shape: (25397, 2), (25397, 1)


## Morphological Analysis

In [11]:
# Updated function to handle missing values
def analyze_morphology(row):
    surface_form = row['Surface']
    deep_structure = row['Deep']
    
    # Split the deep structure by commas and strip spaces
    deep_tags = [tag.strip() for tag in deep_structure.split(',')]
    
    # Initialize the analysis with default values in case tags are missing
    analysis = {
        'Root': deep_tags[0] if len(deep_tags) > 0 else 'Unknown',   # Default to 'Unknown' if missing
        'POS': deep_tags[1] if len(deep_tags) > 1 else 'Unknown',
        'Tense': deep_tags[2] if len(deep_tags) > 2 else 'Unknown',
        'Gender': deep_tags[3] if len(deep_tags) > 3 else 'Unknown',
        'Number': deep_tags[4] if len(deep_tags) > 4 else 'Unknown',
        'Person': deep_tags[5] if len(deep_tags) > 5 else 'Unknown'
    }
    
    return analysis

# Apply the morphological analysis
df['Analysis'] = df.apply(analyze_morphology, axis=1)

# Print the results
print(df[['Surface', 'Analysis']])

            Surface                                           Analysis
0         نہاوندیاں  {'Root': 'نہا', 'POS': 'V', 'Tense': 'Hab', 'G...
1          نہاندیاں  {'Root': 'نہا', 'POS': 'V', 'Tense': 'Hab', 'G...
2         نہاوندیاں  {'Root': 'نہا', 'POS': 'V', 'Tense': 'Hab', 'G...
3               نہا  {'Root': 'نہا', 'POS': 'V', 'Tense': 'Comd', '...
4            نہائیں  {'Root': 'نہا', 'POS': 'V', 'Tense': 'Comd', '...
...             ...                                                ...
151372      چھوٹانے  {'Root': 'چھوٹا', 'POS': 'V + Inf + Mesc + Pl'...
151477  مسکراوندہنے  {'Root': 'مسکراوندہ', 'POS': 'V + Inf + Mesc +...
151582      ملائمنے  {'Root': 'ملائم', 'POS': 'V + Inf + Mesc + Pl'...
151687       کھاسنے  {'Root': 'کھاس', 'POS': 'V + Inf + Mesc + Pl',...
151792      چپچپانے  {'Root': 'چپچپا', 'POS': 'V + Inf + Mesc + Pl'...

[126983 rows x 2 columns]


## Analyze New Word

In [12]:
# Rule-based morphological analysis for a single new word
def analyze_new_word(new_word, df):
    # Look up the word in the existing dataset
    result = df[df['Surface'] == new_word]
    
    if not result.empty:
        # If the word is found, return the existing analysis
        return result['Analysis'].values[0]
    else:
        # If the word is not found, return a default message or further processing
        return "Word not found in the dataset. Unable to predict."

# Example usage
new_word = 'نہاوندیاں'
analysis_result = analyze_new_word(new_word, df)
print(f"Analysis for '{new_word}': {analysis_result}")

Analysis for 'نہاوندیاں': {'Root': 'نہا', 'POS': 'V', 'Tense': 'Hab', 'Gender': 'Fem', 'Number': 'Pl', 'Person': '1P'}


## Rule-based morphological analysis

In [13]:
# Rule-based morphological analysis for a single word
def analyze_new_word(new_word, df):
    # Look up the word in the existing dataset
    result = df[df['Surface'] == new_word]
    
    if not result.empty:
        # Split the analysis components from the 'Deep' column
        deep_analysis = result['Deep'].values[0].split(',')
        
        # Order in 'Deep' column corresponds to:
        # [Root, POS, Tense, Gender, Number, Person]
        analysis_dict = {
            'Root': deep_analysis[0].strip(),
            'POS': deep_analysis[1].strip(),
            'Tense': deep_analysis[2].strip(),
            'Gender': deep_analysis[3].strip(),
            'Number': deep_analysis[4].strip(),
            'Person': deep_analysis[5].strip()
        }
        
        return analysis_dict
    else:
        # If the word is not found, return a default message
        return "Word not found in the dataset. Unable to predict."

# Example usage with a new word
new_word = 'نہائیں'
analysis_result = analyze_new_word(new_word, df)
print(f"Analysis for '{new_word}': {analysis_result}")

Analysis for 'نہائیں': {'Root': 'نہا', 'POS': 'V', 'Tense': 'Comd', 'Gender': 'Sg', 'Number': '2P', 'Person': 'Hon1'}


## Finite State Approach: Tokenization, Lexical, Syntax, and Semantic Analysis

In [14]:
# Step 1: Tokenization using Keras
def tokenize_text(input_text):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([input_text])
    tokens = tokenizer.texts_to_sequences([input_text])[0]
    return tokens, tokenizer.word_index

# Step 2: Lexical Analysis
def lexical_analysis(tokens, word_index):
    indexed_words = {index: word for word, index in word_index.items()}
    identified_tokens = [indexed_words[token] for token in tokens]
    return identified_tokens

# Step 3: Syntax Analysis using NLTK
def syntax_analysis(input_text):
    words = word_tokenize(input_text)
    tagged = pos_tag(words)  # POS tagging for Punjabi
    return tagged

# Step 4: Semantic Analysis (simple example)
def semantic_analysis(tagged_tokens):
    semantics = {}
    for word, tag in tagged_tokens:
        semantics[word] = tag  # Just storing word and its tag
    return semantics

# Example input text in Punjabi Shahmukhi
input_text = "کالا کتا تیز بھاگدا اے۔"

# Applying the finite state approach
tokens, word_index = tokenize_text(input_text)
identified_tokens = lexical_analysis(tokens, word_index)
tagged_tokens = syntax_analysis(input_text)
semantics = semantic_analysis(tagged_tokens)

# Output results
print("Tokens:", tokens)
print("Word Index:", word_index)
print("Identified Tokens:", identified_tokens)
print("Tagged Tokens (Syntax Analysis):", tagged_tokens)
print("Semantic Analysis:", semantics)

Tokens: [1, 2, 3, 4, 5]
Word Index: {'کالا': 1, 'کتا': 2, 'تیز': 3, 'بھاگدا': 4, 'اے۔': 5}
Identified Tokens: ['کالا', 'کتا', 'تیز', 'بھاگدا', 'اے۔']
Tagged Tokens (Syntax Analysis): [('کالا', 'JJ'), ('کتا', 'NNP'), ('تیز', 'NNP'), ('بھاگدا', 'NNP'), ('اے۔', 'NN')]
Semantic Analysis: {'کالا': 'JJ', 'کتا': 'NNP', 'تیز': 'NNP', 'بھاگدا': 'NNP', 'اے۔': 'NN'}


## Train Bidirectional LSTM (Long Short-Term Memory) model 

In [None]:
# Define the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer_surface.word_index) + 1, output_dim=128, input_length=max_surface_len))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(len(tokenizer_deep.word_index) + 1, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 68ms/step - accuracy: 0.2302 - loss: 2.1488 - val_accuracy: 0.3525 - val_loss: 1.6190
Epoch 2/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 68ms/step - accuracy: 0.4541 - loss: 1.3694 - val_accuracy: 0.3806 - val_loss: 1.5558
Epoch 3/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 68ms/step - accuracy: 0.5269 - loss: 1.1261 - val_accuracy: 0.3746 - val_loss: 1.6347
Epoch 4/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 72ms/step - accuracy: 0.5441 - loss: 1.0208 - val_accuracy: 0.3585 - val_loss: 1.8262
Epoch 5/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 72ms/step - accuracy: 0.5465 - loss: 0.9691 - val_accuracy: 0.3678 - val_loss: 2.0149
Epoch 6/50
[1m3175/3175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 93ms/step - accuracy: 0.5465 - loss: 0.9356 - val_accuracy: 0.3435 - val_loss: 2.226

## Save Trained Model

In [None]:
model.save("punjabi_morphological_analyzer.h5")