### RAG Pipelines - Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Read all the pdfs inside the directory
def process_all_pdfs(pdf_directory):
    all_documents = []
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")

        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" ✅ Loaded {len(documents)} pages")

        except Exception as e:
            print(f" ❌ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all pdfs
all_pdf_documents = process_all_pdfs("../data")

Found 5 PDF files to process

Processing: 1409.7495v2.pdf
 ✅ Loaded 11 pages

Processing: 1510.02192v1.pdf
 ✅ Loaded 9 pages

Processing: 1712.03935v1.pdf
 ✅ Loaded 13 pages

Processing: 1905.10437v4.pdf
 ✅ Loaded 31 pages

Processing: 2205.07649v2.pdf
 ✅ Loaded 21 pages

Total documents loaded: 85


In [3]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-03-02T01:33:21+00:00', 'author': 'Yaroslav Ganin, Victor Lempitsky', 'keywords': 'Gradient Reversal, Unsupervised Domain Adaptation, Deep Learning', 'moddate': '2015-03-02T01:33:21+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': 'Unsupervised Domain Adaptation by Backpropagation', 'trapped': '/False', 'source': '..\\data\\pdf\\1409.7495v2.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1', 'source_file': '1409.7495v2.pdf', 'file_type': 'pdf'}, page_content='Unsupervised Domain Adaptation by Backpropagation\nYaroslav Ganin GANIN @SKOLTECH .RU\nVictor Lempitsky LEMPITSKY @SKOLTECH .RU\nSkolkovo Institute of Science and Technology (Skoltech)\nAbstract\nTop-performing deep architectures are trained on\nmassive amounts of labeled data. In the absence\nof labeled data for a certain task,

In [4]:
### Text splitting get into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n","\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents=documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")

    return split_docs

In [5]:
chunks = split_documents(all_pdf_documents)
chunks

Split 85 documents into 383 chunks

Example chunk:
Content: Unsupervised Domain Adaptation by Backpropagation
Yaroslav Ganin GANIN @SKOLTECH .RU
Victor Lempitsky LEMPITSKY @SKOLTECH .RU
Skolkovo Institute of Science and Technology (Skoltech)
Abstract
Top-perfo...
Metadata: {'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-03-02T01:33:21+00:00', 'author': 'Yaroslav Ganin, Victor Lempitsky', 'keywords': 'Gradient Reversal, Unsupervised Domain Adaptation, Deep Learning', 'moddate': '2015-03-02T01:33:21+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': 'Unsupervised Domain Adaptation by Backpropagation', 'trapped': '/False', 'source': '..\\data\\pdf\\1409.7495v2.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1', 'source_file': '1409.7495v2.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-03-02T01:33:21+00:00', 'author': 'Yaroslav Ganin, Victor Lempitsky', 'keywords': 'Gradient Reversal, Unsupervised Domain Adaptation, Deep Learning', 'moddate': '2015-03-02T01:33:21+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': 'Unsupervised Domain Adaptation by Backpropagation', 'trapped': '/False', 'source': '..\\data\\pdf\\1409.7495v2.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1', 'source_file': '1409.7495v2.pdf', 'file_type': 'pdf'}, page_content='Unsupervised Domain Adaptation by Backpropagation\nYaroslav Ganin GANIN @SKOLTECH .RU\nVictor Lempitsky LEMPITSKY @SKOLTECH .RU\nSkolkovo Institute of Science and Technology (Skoltech)\nAbstract\nTop-performing deep architectures are trained on\nmassive amounts of labeled data. In the absence\nof labeled data for a certain task,

### Embedding and vectorStoreDB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb, uuid
from chromadb.config import Settings
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbbeddingManager:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Model not loaded")
        
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    
    # def get_embedding_dimension(self) -> int:
    #     if not self.model:
    #         raise ValueError("Model not loaded")
    #     return self.model.get_sentence_embedding_dimension()

## initialise the embedding manager
embedding_manager = EmbbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully. Embedding dimension: 384


<__main__.EmbbeddingManager at 0x1fdd38c14c0>

### VectorStore

In [8]:
class VectorStore:
    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path = self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name = self.collection_name,
                metadata={
                "description": "PDF document embedding for RAG",
                "hnsw:space": "cosine"  # cosine distance keeps scores bounded
            }
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing document in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents:List[Any], embeddings:np.ndarray):
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match the number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")

        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        document_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            document_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,

                metadatas=metadatas,
                documents=document_text
            )
            print(f"Successfully added {len(documents)} documents toi vector store")
            print(f"Total documents in collection: {self.collection.count}")
        except Exception as e:
            print(f"Error adding documents to vecctor store: {e}")
            raise

vectorstore = VectorStore()
vectorstore


Vector store initialized. Collection: pdf_documents
Existing document in collection: 383


<__main__.VectorStore at 0x1fdd3a45040>

In [9]:
chunks

[Document(metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-03-02T01:33:21+00:00', 'author': 'Yaroslav Ganin, Victor Lempitsky', 'keywords': 'Gradient Reversal, Unsupervised Domain Adaptation, Deep Learning', 'moddate': '2015-03-02T01:33:21+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': '', 'title': 'Unsupervised Domain Adaptation by Backpropagation', 'trapped': '/False', 'source': '..\\data\\pdf\\1409.7495v2.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1', 'source_file': '1409.7495v2.pdf', 'file_type': 'pdf'}, page_content='Unsupervised Domain Adaptation by Backpropagation\nYaroslav Ganin GANIN @SKOLTECH .RU\nVictor Lempitsky LEMPITSKY @SKOLTECH .RU\nSkolkovo Institute of Science and Technology (Skoltech)\nAbstract\nTop-performing deep architectures are trained on\nmassive amounts of labeled data. In the absence\nof labeled data for a certain task,

In [10]:
# Convert text to mbeddings
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts)

# Store in vector database
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 383 texts...


Batches: 100%|██████████| 12/12 [00:04<00:00,  2.53it/s]


Generated embeddings with shape: (383, 384)
Adding 383 documents to vector store...
Successfully added 383 documents toi vector store
Total documents in collection: <bound method Collection.count of Collection(name=pdf_documents)>


### Retriever Pipeline From Vector Store

In [11]:
# Create a class for RAG retrieval
class RAGRetriever:
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbbeddingManager, top_k: int = 5):
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager
        self.top_k = top_k

    # Add a score threshold parameter if needed in retrieve function
    def retrieve(self, query: str, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        print(f"Generating embedding for query: {query}")
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        print(f"Searching for top {self.top_k} similar documents...")

        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=self.top_k
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    distance = float(distance)
                    similarity_score = 1 / (1 + distance)  # Map distance to (0,1] to avoid negatives and keep monotonicity
                    if similarity_score < score_threshold:
                        print(f"Skipping document {doc_id} due to low similarity score: {similarity_score}")
                        continue 
                    else:
                        retrieved_docs.append({'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            
            else:
                print("No documents found.")
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever = RAGRetriever(vector_store=vectorstore, embedding_manager=embedding_manager, top_k=5)
rag_retriever


<__main__.RAGRetriever at 0x1fdd3af5eb0>

In [12]:
rag_retriever.retrieve("What is CNN training procedure described under Unsupervised Domain Adaptation?")

Generating embedding for query: What is CNN training procedure described under Unsupervised Domain Adaptation?
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 123.97it/s]

Generated embeddings with shape: (1, 384)
Searching for top 5 similar documents...
Retrieved 5 documents (after filtering)





[{'id': 'doc_d360e825_59',
  'content': 'its in natural images with unsupervised feature learning.\nIn NIPS Workshop on Deep Learning and Unsupervised\nFeature Learning 2011, 2011.\nOquab, M., Bottou, L., Laptev, I., and Sivic, J. Learning\nand transferring mid-level image representations using\nconvolutional neural networks. In CVPR, 2014.\nPan, Sinno Jialin, Tsang, Ivor W., Kwok, James T., and\nYang, Qiang. Domain adaptation via transfer component\nanalysis. IEEE Transactions on Neural Networks, 22(2):\n199–210, 2011.\nS. Chopra, S. Balakrishnan and Gopalan, R. Dlid: Deep\nlearning for domain adaptation by interpolating between\ndomains. In ICML Workshop on Challenges in Repre-\nsentation Learning, 2013.\nSaenko, Kate, Kulis, Brian, Fritz, Mario, and Darrell,\nTrevor. Adapting visual category models to new do-\nmains. In ECCV, pp. 213–226. 2010.\nShimodaira, Hidetoshi. Improving predictive inference un-\nder covariate shift by weighting the log-likelihood func-\ntion. Journal of Stat

In [15]:
# Simple RAG Pipeline with Groq LLM
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()

# Initialize Groq LLM
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("Set GROQ_API_KEY in your .env file")

llm = ChatGroq(api_key=groq_api_key, model="qwen/qwen3-32b", temperature=0.1, max_tokens=1024)

# Simple RAG function: retrieve context + generate response
def rag_simple(query: str, retriever: RAGRetriever, llm: ChatGroq) -> str:
    # Retrieve relevant documents
    retrieved_docs = retriever.retrieve(query)

    # Combine retrieved documents into context
    context = "\n\n".join([doc['content'] for doc in retrieved_docs]) if retrieved_docs else ""

    if not context:
        return "I'm sorry, I couldn't find any relevant information to answer your question."

    # Create prompt for LLM
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    print(f"Generated Prompt:\n{prompt}\n")

    # Generate response from LLM
    response = llm.invoke([prompt.format(context=context, question=query)])

    return response.content


In [18]:
answer = rag_simple("What is CNN training procedure described under Unsupervised Domain Adaptation?", rag_retriever, llm)
print(f"RAG Answer:\n{answer}")

Generating embedding for query: What is CNN training procedure described under Unsupervised Domain Adaptation?
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 132.20it/s]

Generated embeddings with shape: (1, 384)
Searching for top 5 similar documents...
Retrieved 5 documents (after filtering)
Generated Prompt:
Context:
its in natural images with unsupervised feature learning.
In NIPS Workshop on Deep Learning and Unsupervised
Feature Learning 2011, 2011.
Oquab, M., Bottou, L., Laptev, I., and Sivic, J. Learning
and transferring mid-level image representations using
convolutional neural networks. In CVPR, 2014.
Pan, Sinno Jialin, Tsang, Ivor W., Kwok, James T., and
Yang, Qiang. Domain adaptation via transfer component
analysis. IEEE Transactions on Neural Networks, 22(2):
199–210, 2011.
S. Chopra, S. Balakrishnan and Gopalan, R. Dlid: Deep
learning for domain adaptation by interpolating between
domains. In ICML Workshop on Challenges in Repre-
sentation Learning, 2013.
Saenko, Kate, Kulis, Brian, Fritz, Mario, and Darrell,
Trevor. Adapting visual category models to new do-
mains. In ECCV, pp. 213–226. 2010.
Shimodaira, Hidetoshi. Improving predictive inf




RAG Answer:
<think>
Okay, let's tackle this question. The user is asking about the CNN training procedure described under Unsupervised Domain Adaptation in the provided context. First, I need to parse through the given information to find relevant details.

Looking at the context, there are several references to domain adaptation techniques. The key paper mentioned is "Unsupervised Domain Adaptation by Backpropagation" by Gong et al. (2013). The figure (Figure 4) shows a semi-supervised approach where labeled target data is used, leading to lower error than using only source or target data. 

The answer should focus on the training procedure outlined in Gong et al.'s work. From the figure's caption, it mentions using both source and target data, with labeled target data being incorporated. The method is called "Connecting the dots with landmarks," which involves discriminatively learning domain-invariant features. 

The procedure likely involves training a CNN to minimize a loss that c

In [20]:
# Enhanced RAG pipeline Features
def rag_advanced(query: str, retriever: RAGRetriever, llm: ChatGroq, score_threshold: float = 0.2, return_context: bool = False) -> str:
    """
    RAG pipeline with extra features:
       returns answers, sources, confidence score and optionally full context.
    """
    # Retrieve relevant documents
    retrieved_docs = retriever.retrieve(query, score_threshold=score_threshold)

    if not retrieved_docs:
        return {"answer": "I'm sorry, I couldn't find any relevant information to answer your question.",
                "sources": [],
                "confidence": 0.0,
                "context": ""
               }
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in retrieved_docs]) 
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('doc_index', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + "..."
    } for doc in retrieved_docs]
    confidence = max(doc['similarity_score'] for doc in retrieved_docs)

    # Generate answer using LLM
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    print(f"Generated Prompt:\n{prompt}\n")

    response = llm.invoke([prompt.format(context=context, question=query)])

    output = {
        "answer": response.content,
        "sources": sources,
        "confidence": confidence
    }

    if return_context:
        output["context"] = context
    return output

# Example usage of advanced RAG
result = rag_advanced("What is CNN training procedure described under Unsupervised Domain Adaptation?", rag_retriever, llm, score_threshold=0.1)
# Show  answwers, sources, confidence and context preview
print(f"Advanced RAG Answer:\n{result['answer']}\n")
print(f"Sources:\n{result['sources']}\n")
print(f"Confidence Score: {result['confidence']}\n")
print(f"Context Preview:\n{result.get('context', '')[:300]}...\n")

Generating embedding for query: What is CNN training procedure described under Unsupervised Domain Adaptation?
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 121.33it/s]


Generated embeddings with shape: (1, 384)
Searching for top 5 similar documents...
Retrieved 5 documents (after filtering)
Generated Prompt:
Context:
its in natural images with unsupervised feature learning.
In NIPS Workshop on Deep Learning and Unsupervised
Feature Learning 2011, 2011.
Oquab, M., Bottou, L., Laptev, I., and Sivic, J. Learning
and transferring mid-level image representations using
convolutional neural networks. In CVPR, 2014.
Pan, Sinno Jialin, Tsang, Ivor W., Kwok, James T., and
Yang, Qiang. Domain adaptation via transfer component
analysis. IEEE Transactions on Neural Networks, 22(2):
199–210, 2011.
S. Chopra, S. Balakrishnan and Gopalan, R. Dlid: Deep
learning for domain adaptation by interpolating between
domains. In ICML Workshop on Challenges in Repre-
sentation Learning, 2013.
Saenko, Kate, Kulis, Brian, Fritz, Mario, and Darrell,
Trevor. Adapting visual category models to new do-
mains. In ECCV, pp. 213–226. 2010.
Shimodaira, Hidetoshi. Improving predictive inf

In [26]:
# --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("What is CNN training procedure described under Unsupervised Domain Adaptation?", top_k=5, min_score=0.1, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Generating embedding for query: What is CNN training procedure described under Unsupervised Domain Adaptation?
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 122.83it/s]

Generated embeddings with shape: (1, 384)
Searching for top 5 similar documents...
Retrieved 5 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
its in natural images with unsupervised feature learning.
In NIPS Workshop on Deep Learning and Unsupervised
Feature Learning 2011, 2011.
Oquab, M., Bottou, L., Laptev, I., and Sivic, J. Learning
and transferring mid-level image representations using





convolutional neural networks. In CVPR, 2014.
Pan, Sinno Jialin, Tsang, Ivor W., Kwok, James T., and
Yang, Qiang. Domain adaptation via transfer component
analysis. IEEE Transactions on Neural Networks, 22(2):
199–210, 2011.
S. Chopra, S. Balakrishnan and Gopalan, R. Dlid: Deep
learning for domain adaptation by interpolating between
domains. In ICML Workshop on Challenges in Repre-
sentation Learning, 2013.
Saenko, Kate, Kulis, Brian, Fritz, Mario, and Darrell,
Trevor. Adapting visual category models to new do-
mains. In ECCV, pp. 213–226. 2010.
Shimodaira, Hidetoshi. Improving predictive inference un-
der covariate shift by weighting the log-likelihood func-
tion. Journal of Statistical Planning and Inference , 90

its in natural images with unsupervised feature learning.
In NIPS Workshop on Deep Learning and Unsupervised
Feature Learning 2011, 2011.
Oquab, M., Bottou, L., Laptev, I., and Sivic, J. Learning
and transferring mid-level image representations using
convolutional neural ne