In [None]:
!pip install wordcloud
!pip install matplotlib
!pip install nltk

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import re

from collections import Counter

# Demo input (swap this literal for the contents of a file to analyse your own corpus)
text = """Natural Language Processing with Python is a very interesting field. This book is meant for students and enthusiasts who are interested in learning NLP."""

# Normalise: drop everything that is neither a word character nor whitespace, then lowercase
text = re.sub(r'[^\w\s]', '', text).lower()

# Fetch the NLTK stopword corpus and filter English stopwords out of the whitespace-split tokens
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token not in stop_words, text.split()))
clean_text = ' '.join(filtered_words)

# Occurrence count for every remaining word
word_freq = Counter(filtered_words)

# Configure the cloud and populate it straight from the frequency table
wordcloud = WordCloud(
    width=800,
    height=400,
    max_words=100,
    background_color='white',
    colormap='viridis',
).generate_from_frequencies(word_freq)

# Render the cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Persist the rendered cloud as a PNG in the working directory
wordcloud.to_file("wordcloud_output.png")


In [None]:
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install nltk


In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

nltk.download('stopwords')
from nltk.corpus import stopwords
import re

# Small in-memory corpus used for the similarity demo
documents = [
    "Natural Language Processing enables machines to understand human language.",
    "Deep learning techniques are widely used in NLP applications.",
    "Natural Language Processing and Deep Learning are closely connected.",
    "Support Vector Machines are another approach to machine learning.",
    "Understanding human language is a key focus in AI research."
]

# English stopwords from NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Strip punctuation, lowercase, and drop English stopwords.

    Args:
        text: Raw document string.

    Returns:
        The surviving lowercase tokens joined by single spaces.
    """
    normalized = re.sub(r'[^\w\s]', '', text).lower()
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Clean every document, then embed the corpus as TF-IDF vectors
processed_docs = [preprocess(doc) for doc in documents]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_docs)

# Pairwise cosine similarity between all document vectors (one row/column per document)
similarity_matrix = cosine_similarity(tfidf_matrix)

# Derive the tick labels from the corpus size so the heatmap stays correctly
# labelled when documents are added to or removed from `documents`
doc_labels = [f'Doc{i}' for i in range(1, len(documents) + 1)]

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_matrix, annot=True, cmap='coolwarm',
            xticklabels=doc_labels, yticklabels=doc_labels)
plt.title("Document Similarity Heatmap")
plt.show()


In [None]:
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install textblob
!pip install nltk


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import nltk
import re
# Load the tweets to analyse (replace the path with your own dataset).
# The rest of this cell reads the 'tweet' and 'date' columns of this frame.
data = pd.read_csv('sample_tweets.csv')  # Assume 'sample_tweets.csv' has 'date' and 'tweet' columns

# Download the NLTK stopword corpus, then import its reader
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stopwords as a set; preprocess_text filters tokens against it
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Clean a raw tweet before sentiment scoring.

    Removes URLs, keeps only ASCII letters and whitespace, lowercases the
    result, and drops English stopwords (module-level ``stop_words``).

    Args:
        text: Raw tweet text. Non-string values (None, or the float NaN that
            pandas produces for an empty CSV field) are treated as an empty
            tweet instead of raising.

    Returns:
        The cleaned tokens joined by single spaces; '' for non-string input.
    """
    # re.sub raises TypeError on NaN/None, which would abort the whole .apply()
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove punctuation and special characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Clean every tweet
data['cleaned_tweet'] = data['tweet'].apply(preprocess_text)


def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Score every cleaned tweet
data['sentiment'] = data['cleaned_tweet'].apply(get_sentiment)

# Parse the 'date' column into datetimes so it can be bucketed by calendar day
data['date'] = pd.to_datetime(data['date'])

# Mean sentiment per calendar day, as a two-column frame ('date', 'sentiment')
calendar_day = data['date'].dt.date
sentiment_over_time = data.groupby(calendar_day)['sentiment'].mean().reset_index()

# Line chart of the daily averages
plt.figure(figsize=(12, 6))
sns.lineplot(data=sentiment_over_time, x='date', y='sentiment', color='blue')
plt.title('Sentiment Timeline Over Time')
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


In [None]:
!pip install spacy
!pip install nltk
!pip install matplotlib
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import nltk
from nltk.tree import Tree
from spacy import displacy
import matplotlib.pyplot as plt
# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')

# Sentence to analyse and its parsed spaCy Doc
sentence = "The quick brown fox jumps over the lazy dog."
doc = nlp(sentence)


def to_nltk_tree(node):
    """Recursively convert a spaCy token and its dependents into an NLTK tree.

    Args:
        node: A spaCy token.

    Returns:
        The token's text (``orth_``) when it has no left or right children;
        otherwise a ``Tree`` labelled with the token's text whose children are
        the converted child tokens.
    """
    if node.n_lefts + node.n_rights == 0:
        return node.orth_
    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)


# Build the tree starting from the root token of the first sentence
first_sentence = list(doc.sents)[0]
nltk_tree = to_nltk_tree(first_sentence.root)

# Visualize using NLTK
# NOTE(review): Tree.draw() appears to open a separate GUI window - confirm it
# works in the target (possibly headless) notebook environment.
nltk_tree.draw()

# Alternative visualization using SpaCy's displacy
displacy.render(doc, style="dep", jupyter=True)


In [None]:
!pip install gensim
!pip install sklearn
!pip install matplotlib
!pip install seaborn


In [None]:
import gensim.downloader as api
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import inspect

# Load pre-trained GloVe vectors through the gensim downloader
model = api.load("glove-wiki-gigaword-100")  # GloVe embeddings with 100 dimensions
words = ['king', 'queen', 'man', 'woman', 'apple', 'banana', 'fruit', 'orange', 'lion', 'tiger', 'cat', 'dog', 'animal']

# Keep labels and vectors aligned: only words found in the vocabulary are
# projected, and the same filtered list is used to label the points later
plotted_words = [word for word in words if word in model]
if not plotted_words:
    raise ValueError("None of the requested words are in the embedding vocabulary")
word_vectors = [model[word] for word in plotted_words]

# First reduction with PCA. PCA cannot extract more components than
# min(n_samples, n_features), so the requested 50 is capped for short word lists
n_pca_components = min(50, len(word_vectors), len(word_vectors[0]))
pca = PCA(n_components=n_pca_components)
word_vectors_pca = pca.fit_transform(word_vectors)

# Further reduction with t-SNE.
# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
tsne = TSNE(
    n_components=2,
    perplexity=min(5, len(word_vectors) - 1),  # perplexity must stay below the sample count
    random_state=42,
    **{tsne_iter_kwarg: 500},
)
word_vectors_2d = tsne.fit_transform(word_vectors_pca)

# One row per plotted word: 2D coordinates plus the word itself
embedding_df = pd.DataFrame(word_vectors_2d, columns=['x', 'y'])
embedding_df['word'] = plotted_words

plt.figure(figsize=(10, 8))
sns.scatterplot(x='x', y='y', data=embedding_df, hue='word', palette='viridis')

# Annotate each point with its word label, nudged slightly off the marker
for point in embedding_df.itertuples(index=False):
    plt.text(point.x + 0.05, point.y + 0.05, point.word, fontsize=9)

plt.title('2D Projection of Word Embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()


In [None]:
!pip install gensim
!pip install sklearn
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install pandas


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim import corpora
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
import re
# Load the documents to model; preprocessing below reads the 'text' column
data = pd.read_csv('documents.csv')  # Assume 'documents.csv' has a column 'text'

# Download the NLTK resources requested here (stopword list and WordNet),
# then import the stopword reader and the lemmatizer class
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared helpers used by preprocess_text: English stopword set and a lemmatizer instance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenise a document for topic modelling.

    Digits and punctuation are removed, the text is lowercased and split on
    whitespace, stopwords are skipped, and each remaining token is lemmatized.

    Args:
        text: Raw document string.

    Returns:
        List of lemmatized tokens.
    """
    without_digits = re.sub(r'\d+', '', text)
    normalized = re.sub(r'[^\w\s]', '', without_digits).lower()
    tokens = []
    for word in normalized.split():
        if word in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens

# Apply preprocessing (each document becomes a list of tokens)
data['processed_text'] = data['text'].apply(preprocess_text)

# Create dictionary and bag-of-words corpus
dictionary = corpora.Dictionary(data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

# Train LDA model; the fixed random_state makes the topics reproducible across runs
num_topics = 5
lda_model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                                   passes=10, random_state=42)

# Extract a dense topic vector per document. get_document_topics returns
# (topic_id, probability) pairs, so each probability is placed by its topic id
# rather than by position; any topic that is not returned stays at 0.0.
topic_distributions = []
for doc in corpus:
    topic_row = [0.0] * num_topics
    for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0):
        topic_row[topic_id] = prob
    topic_distributions.append(topic_row)

# Convert to DataFrame
topic_df = pd.DataFrame(topic_distributions, columns=[f'Topic_{i+1}' for i in range(num_topics)])

# Normalize each row to sum to 1 so the values are true proportions.
# sklearn's normalize defaults to the L2 norm (unit Euclidean length), which
# would not give proportions for the stacked bar chart, hence norm='l1'.
topic_df_normalized = pd.DataFrame(normalize(topic_df, norm='l1', axis=1), columns=topic_df.columns)
topic_df_normalized['Document'] = data.index

# Plot heatmap of topic distributions (topics as rows, documents as columns)
plt.figure(figsize=(12, 8))
sns.heatmap(topic_df_normalized.set_index('Document').T, cmap='viridis', cbar=True)
plt.title('Heatmap of Topic Distributions Across Documents')
plt.xlabel('Document ID')
plt.ylabel('Topic')
plt.show()

# Plot stacked bar chart (one bar per document, one segment per topic)
topic_df_normalized.set_index('Document').plot(kind='bar', stacked=True, figsize=(12, 6), colormap='Set3')
plt.title('Stacked Bar Chart of Topic Proportions per Document')
plt.xlabel('Document ID')
plt.ylabel('Proportion of Topics')
plt.show()


In [None]:
!pip install spacy
!pip install networkx
!pip install matplotlib
!pip install pyvis
!python -m spacy download en_core_web_sm


In [None]:
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')

# Text whose named entities will be linked
text = """Google was founded by Larry Page and Sergey Brin while they were students at Stanford University in California. 
Elon Musk is the CEO of SpaceX and Tesla, which are based in the United States."""

# Pair up every two entities that appear in the same sentence (in order of appearance)
doc = nlp(text)
entity_pairs = []
for sent in doc.sents:
    entities = [ent.text for ent in sent.ents]
    entity_pairs.extend(
        (first, second)
        for position, first in enumerate(entities)
        for second in entities[position + 1:]
    )

# Undirected graph: adding an edge also adds both of its endpoints as nodes
G = nx.Graph()
G.add_edges_from(entity_pairs)

# Static rendering with Matplotlib using a spring layout
plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
nx.draw(
    G,
    pos,
    with_labels=True,
    node_color='skyblue',
    edge_color='black',
    node_size=1500,
    font_size=10,
)
plt.title('Entity Relationship Graph')
plt.show()

# Interactive rendering with Pyvis, written to an HTML file
net = Network(notebook=True)
net.from_nx(G)
net.show("entity_relationship_graph.html")


In [None]:
!pip install spacy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!python -m spacy download en_core_web_sm


In [None]:
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
# Load the documents to analyse; later steps in this cell read the 'text' and 'date' columns
data = pd.read_csv('documents.csv')  # Assume 'documents.csv' has columns 'date' and 'text'

# Preprocessing function
def preprocess_text(text):
    """Return ``text`` with digits and punctuation removed, lowercased.

    Args:
        text: Raw document string.

    Returns:
        The cleaned string; whitespace is left untouched.
    """
    without_digits = re.sub(r'\d+', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)
    return without_punctuation.lower()

# Clean every document
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')


def extract_entities(text):
    """Return the text of every PERSON, ORG or GPE entity spaCy finds in ``text``.

    Args:
        text: Document string to run through the ``nlp`` pipeline.

    Returns:
        List of entity strings, in the order spaCy reports them.
    """
    wanted_labels = ['PERSON', 'ORG', 'GPE']
    return [ent.text for ent in nlp(text).ents if ent.label_ in wanted_labels]

# Apply entity extraction to the ORIGINAL text, not the cleaned column.
# 'cleaned_text' is lowercased and stripped of punctuation, so entities taken
# from it could never equal a cased name such as 'Google' in the lookup below.
data['entities'] = data['text'].apply(extract_entities)

# Count entity mentions across documents
entity_counts = Counter()
for entities in data['entities']:
    entity_counts.update(entities)

# Convert to DataFrame for easy manipulation
entity_count_df = pd.DataFrame(entity_counts.items(), columns=['Entity', 'Count'])

# Explode the list of entities so every mention gets its own row
data_exploded = data.explode('entities')

# Aggregate entity mentions over time (one row per date/entity pair)
time_agg = data_exploded.groupby(['date', 'entities']).size().reset_index(name='mention_count')

# Example: Visualize the trend of an entity over time
entity_to_plot = 'Google'
entity_trend = time_agg[time_agg['entities'] == entity_to_plot]

plt.figure(figsize=(10, 6))
sns.lineplot(x='date', y='mention_count', data=entity_trend)
plt.title(f'Mention Trend of "{entity_to_plot}" Over Time')
plt.xlabel('Date')
plt.ylabel('Mention Count')
plt.xticks(rotation=45)
plt.show()

# Example: heatmap of mention counts per date for the ten most-mentioned entities
# (the crosstab counts mentions by date and entity; it is not a pairwise co-occurrence matrix)
top_entities = entity_count_df.nlargest(10, 'Count')['Entity']
co_occurrence_matrix = pd.crosstab(data_exploded['date'], data_exploded['entities'])
plt.figure(figsize=(12, 8))
sns.heatmap(co_occurrence_matrix[top_entities], cmap='YlGnBu', annot=True)
plt.title('Heatmap of Entity Mentions Across Documents')
plt.xlabel('Entities')
plt.ylabel('Dates')
plt.show()


In [None]:
!pip install sklearn
!pip install matplotlib
!pip install seaborn
!pip install nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import nltk
import re
# Load the labelled dataset; later steps in this cell read its 'text' and 'label' columns
data = pd.read_csv('text_classification_data.csv')

# Preprocessing function
def preprocess_text(text):
    """Normalise a document: strip digits, strip punctuation, lowercase.

    Args:
        text: Raw document string.

    Returns:
        The normalised string (whitespace is preserved as-is).
    """
    cleaned = text
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Convert text to numerical format
# NOTE(review): the vectorizer is fitted on the full dataset before the split,
# so IDF weights are computed with test documents included - consider fitting
# on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_text'])

# Define labels
y = data['label']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred = classifier.predict(X_test)

# Generate classification report
print(classification_report(y_test, y_pred))

# Generate confusion matrix. Passing labels=classes_ fixes the row/column order
# to match the tick labels below, even if a class is missing from the test split.
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes_)

# Plot confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classifier.classes_, yticklabels=classifier.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# The curves below are binary metrics. Column 1 of predict_proba is the score
# for classifier.classes_[1], so that class is named explicitly as the positive
# label; without pos_label, labels other than {0, 1} / {-1, 1} are rejected.
positive_class = classifier.classes_[1]
positive_scores = classifier.predict_proba(X_test)[:, 1]  # computed once, shared by both curves

# Generate precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, positive_scores, pos_label=positive_class)
plt.figure(figsize=(10, 6))
plt.plot(recall, precision, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, positive_scores, pos_label=positive_class)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='r', label=f'ROC Curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()


In [None]:
!pip install nltk
!pip install wordcloud
!pip install matplotlib
!pip install seaborn
!pip install pandas


In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
# Load sample text data (for example, a CSV file with a column 'text');
# preprocessing below reads the 'text' column
data = pd.read_csv('text_data.csv')

# Download the NLTK stopword corpus and keep the English list as a set
# for the token filter in preprocess_text
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean a document and return it as a space-joined string of tokens.

    Digits and punctuation are removed, the text is lowercased, and English
    stopwords (module-level ``stop_words``) are dropped.

    Args:
        text: Raw document string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    without_digits = re.sub(r'\d+', '', text)
    normalized = re.sub(r'[^\w\s]', '', without_digits).lower()
    kept_tokens = filter(lambda token: token not in stop_words, normalized.split())
    return ' '.join(kept_tokens)

import numpy as np

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Word frequencies over the whole cleaned corpus, most frequent first
all_text = ' '.join(data['cleaned_text'])
word_freq = pd.Series(all_text.split()).value_counts()
top_words = word_freq.head(10)

# Placeholder sentiment: random scores for illustration only (not a real
# sentiment analysis). A seeded generator keeps the charts identical across
# "Restart & Run All".
rng = np.random.default_rng(42)
data['sentiment'] = rng.uniform(-1, 1, len(data))

# Generate a word cloud from the cleaned corpus
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)


def _draw_wordcloud(ax):
    """Render the word cloud image on ``ax`` with the axes hidden."""
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Word Cloud of Most Frequent Words')


def _draw_top_words(ax):
    """Draw a horizontal bar chart of the ten most frequent words on ``ax``."""
    sns.barplot(x=top_words.values, y=top_words.index, palette='viridis', ax=ax)
    ax.set_title('Top 10 Most Frequent Words')
    ax.set_xlabel('Frequency')
    ax.set_ylabel('Words')


def _draw_sentiment(ax):
    """Plot the per-document sentiment score against the document index on ``ax``."""
    ax.plot(data.index, data['sentiment'], color='blue', linestyle='-', marker='o')
    ax.set_title('Sentiment Trend Over Time')
    ax.set_xlabel('Document Index')
    ax.set_ylabel('Sentiment Score')
    ax.grid(True)


# Standalone figures: word cloud, top-10 bar chart, sentiment line chart
for draw_panel, panel_size in (
    (_draw_wordcloud, (10, 6)),
    (_draw_top_words, (10, 6)),
    (_draw_sentiment, (12, 6)),
):
    fig, ax = plt.subplots(figsize=panel_size)
    draw_panel(ax)
    plt.show()

# Create a combined infographic with subplots, reusing the same panel helpers
fig, axs = plt.subplots(2, 2, figsize=(18, 12))
_draw_wordcloud(axs[0, 0])
_draw_top_words(axs[0, 1])
_draw_sentiment(axs[1, 0])

# Placeholder for future elements (e.g., pie chart or other summary statistics)
axs[1, 1].text(0.5, 0.5, 'Additional Summary or Visualization Here',
               horizontalalignment='center', verticalalignment='center', fontsize=14)
axs[1, 1].set_axis_off()

plt.tight_layout()
plt.show()


In [None]:
!pip install sklearn
!pip install matplotlib
!pip install seaborn
!pip install nltk
!pip install umap-learn
!pip install gensim


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import nltk
import re
from gensim.models import Word2Vec
# Load dataset (for example, a CSV file with a column 'text').
# Later steps in this cell read the 'text' column and also a 'label' column.
data = pd.read_csv('text_data.csv')

# Download the NLTK stopword corpus and keep the English list as a set
# for the token filter in preprocess_text
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_text(text):
    """Clean a document and return it as a space-joined string of tokens.

    Removes digits and punctuation, lowercases, splits on whitespace, and
    drops every token found in the module-level ``stop_words`` set.

    Args:
        text: Raw document string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stripped = re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text))
    kept_tokens = []
    for token in stripped.lower().split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

import inspect

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Alternatively, generate Word2Vec embeddings: one vector per document,
# the mean of its word vectors
W2V_DIM = 100  # single source of truth for the embedding size and the zero fallback
sentences = [text.split() for text in data['cleaned_text']]
word2vec_model = Word2Vec(sentences, vector_size=W2V_DIM, window=5, min_count=1, workers=4)


def _mean_vector(tokens):
    """Average the Word2Vec vectors of ``tokens``; all-zero vector if none are in the vocabulary."""
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(W2V_DIM)


X_word2vec = np.array([_mean_vector(sentence) for sentence in sentences])

# Initial dimensionality reduction with PCA. PCA cannot extract more components
# than min(n_samples, n_features), so the requested 50 is capped for small datasets
n_samples, n_features = X_tfidf.shape
pca = PCA(n_components=min(50, n_samples, n_features))
X_pca = pca.fit_transform(X_tfidf.toarray())

# Further reduction with t-SNE.
# Newer scikit-learn releases renamed TSNE's `n_iter` argument to `max_iter`;
# pick whichever name the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
tsne = TSNE(
    n_components=2,
    perplexity=min(30, n_samples - 1),  # perplexity must stay below the sample count
    random_state=42,
    **{tsne_iter_kwarg: 500},
)
X_tsne = tsne.fit_transform(X_pca)

# Alternatively, use UMAP for reduction
umap_model = umap.UMAP(n_neighbors=15, n_components=2, random_state=42)
X_umap = umap_model.fit_transform(X_pca)

# Create DataFrames for visualization. Labels are attached positionally
# (to_numpy) so they line up with the embedding rows regardless of data's index.
labels = data['label'].to_numpy()  # Assume there's a 'label' column for categorization
df_tsne = pd.DataFrame(X_tsne, columns=['Dimension 1', 'Dimension 2'])
df_tsne['label'] = labels

df_umap = pd.DataFrame(X_umap, columns=['Dimension 1', 'Dimension 2'])
df_umap['label'] = labels

# Visualize t-SNE result
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Dimension 1', y='Dimension 2', hue='label', palette='viridis', data=df_tsne, s=60, alpha=0.7)
plt.title('t-SNE Visualization of Text Data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend(title='Category')
plt.show()

# Visualize UMAP result
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Dimension 1', y='Dimension 2', hue='label', palette='coolwarm', data=df_umap, s=60, alpha=0.7)
plt.title('UMAP Visualization of Text Data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend(title='Category')
plt.show()